from typing import Tuple, Dict, List

from pymupdf import TEXTFLAGS_DICT, TEXT_PRESERVE_IMAGES
import pymupdf

# Rectangle as (x0, y0, x1, y1). _correct_bbox treats index 1 as the top edge
# and index 3 as the bottom edge.
BBox = Tuple[float, float, float, float]
# Point as (x, y), taken verbatim from a span's 'origin' entry; index 1 is the
# y coordinate that _correct_bbox aligns the bottom edge of the bbox with.
Origin = Tuple[float, float]


def extract_text(pdf_path: str) -> Dict[int, List[Dict]]:
    """Collect every non-blank text span of a PDF, grouped by page number.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict keyed by ``page.number``. Each value is a list with one dict
        per span holding the keys ``original_text``, ``stripped_text``,
        ``bbox`` (adjusted by ``_correct_bbox``), ``origin``, ``size``,
        ``font`` and ``color`` (converted by ``_int_to_rgbf``). Pages without
        any non-blank span have no entry.
    """
    # Same flags as the default for 'dict' extraction, minus image blocks.
    flags = TEXTFLAGS_DICT & ~TEXT_PRESERVE_IMAGES
    extracted_text = {}

    with pymupdf.open(pdf_path) as pdf_file:
        for page in pdf_file:
            text_dict = page.get_text('dict', flags=flags, sort=True)
            for block in text_dict['blocks']:
                # Tolerate blocks that carry no 'lines' entry instead of
                # failing with a KeyError.
                for line in block.get('lines', ()):
                    for span in line['spans']:
                        original_text = span['text']
                        stripped_text = original_text.strip()
                        if not stripped_text:
                            # Whitespace-only spans hold nothing to replace.
                            continue

                        origin = span['origin']
                        text_with_metadata = {
                            'original_text': original_text,
                            'stripped_text': stripped_text,
                            'bbox': _correct_bbox(span['bbox'], origin),
                            'origin': origin,
                            'size': span['size'],
                            'font': span['font'],
                            'color': _int_to_rgbf(span['color']),
                        }
                        extracted_text.setdefault(page.number, []).append(
                            text_with_metadata
                        )

    return extracted_text


def replace_texts(
    pdf_path: str,
    output_path: str,
    replacement_data: Dict,
    preserve_original_fonts: bool = True,
) -> None:
    """Redact the listed spans and write corrected text in their place.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path the modified PDF is saved to.
        replacement_data: Maps a page number (anything ``int()`` accepts) to
            a list of dicts. Each dict must provide ``bbox``, ``origin``,
            ``original_text``, ``stripped_text``, ``corrected_text``,
            ``size``, ``font`` and ``color``.
        preserve_original_fonts: When true, look the span's font up among the
            fonts of the page via ``_get_font_name``; otherwise always insert
            with ``'helvetica'``.

    Raises:
        KeyError: If a replacement dict lacks one of the required keys.
    """
    with pymupdf.open(pdf_path) as pdf_file:
        for page_number, page_replacements in replacement_data.items():
            page = pdf_file[int(page_number)]
            # The font list is only needed when original fonts are reused.
            page_fonts = page.get_fonts() if preserve_original_fonts else []

            # Add redact annotations to remove the original text. All of them
            # are applied in one go before any new text is inserted.
            for replacement in page_replacements:
                page.add_redact_annot(replacement['bbox'])
            page.apply_redactions()

            # Insert the corrected text.
            for replacement in page_replacements:
                original_text = replacement['original_text']
                stripped_text = replacement['stripped_text']
                corrected_text = replacement['corrected_text']
                # Substituting inside the original text keeps whatever
                # whitespace surrounded the stripped text.
                replacement_text = original_text.replace(
                    stripped_text, corrected_text
                )

                if preserve_original_fonts:
                    font_name = _get_font_name(replacement['font'], page_fonts)
                else:
                    font_name = 'helvetica'

                page.insert_text(
                    replacement['origin'],
                    replacement_text,
                    fontsize=replacement['size'],
                    fontname=font_name,
                    color=replacement['color'],
                )

        pdf_file.save(output_path, garbage=3, deflate=True, clean=True)


def _correct_bbox(bbox: BBox, origin: Origin) -> BBox:
    """Shift *bbox* vertically so its bottom edge sits at the origin's y.

    The height ``bbox[3] - bbox[1]`` and both x coordinates are preserved.
    A bbox whose bottom edge already equals ``origin[1]`` is returned
    unchanged (as a tuple).
    """
    x0, top, x1, bottom = bbox
    baseline_y = origin[1]
    if bottom != baseline_y:
        top = baseline_y - (bottom - top)
        bottom = baseline_y
    return (x0, top, x1, bottom)


def _int_to_rgbf(color: int) -> Tuple[float, float, float]:
    """Split a packed ``0xRRGGBB`` integer into (r, g, b) floats in 0..1."""
    red = (color >> 16) & 0xFF
    green = (color >> 8) & 0xFF
    blue = color & 0xFF
    return (red / 255, green / 255, blue / 255)


def _get_font_name(keyword: str, fonts: List) -> str:
    """Return the name of the first font entry whose index 3 contains *keyword*.

    Args:
        keyword: Font name to search for (substring match).
        fonts: Font entries as returned by ``page.get_fonts()``. Index 3 is
            searched and index 4 is returned; presumably these are the base
            font name and the page-level reference name -- confirm against
            the PyMuPDF documentation.

    Returns:
        Index 4 of the first matching entry, or ``'helvetica'`` when nothing
        matches.
    """
    for font in fonts:
        if keyword in font[3]:
            return font[4]
    return 'helvetica'