| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283 |
- from pymupdf import TEXTFLAGS_DICT, TEXT_PRESERVE_IMAGES
- from typing import Tuple, Dict, List
-
- import pymupdf
-
- BBox = Tuple[float, float, float, float] # rectangle as (x0, y0, x1, y1): top-left and bottom-right corners
- Origin = Tuple[float, float] # point as (x, y); holds a span's 'origin', whose y _correct_bbox uses as the bbox bottom
-
def extract_text(pdf_path: str):
    """Collect every non-blank text span of a PDF, grouped by page number.

    Each page is read with ``page.get_text('dict', ...)`` using
    ``TEXTFLAGS_DICT`` with the ``TEXT_PRESERVE_IMAGES`` bit cleared and
    ``sort=True``.

    Args:
        pdf_path: Path of the PDF, handed to ``pymupdf.open``.

    Returns:
        A dict mapping ``page.number`` to a list of span records. Each record
        has the keys ``original_text``, ``stripped_text``, ``bbox`` (adjusted
        by ``_correct_bbox``), ``origin``, ``size``, ``font`` and ``color``
        (converted by ``_int_to_rgbf``). Pages without any non-blank span do
        not appear in the dict.
    """
    text_flags = TEXTFLAGS_DICT & ~TEXT_PRESERVE_IMAGES
    spans_by_page = {}

    with pymupdf.open(pdf_path) as document:
        for page in document:
            page_dict = page.get_text('dict', flags=text_flags, sort=True)
            for block in page_dict['blocks']:
                for line in block['lines']:
                    for span in line['spans']:
                        raw_text = span['text']
                        trimmed_text = raw_text.strip()
                        if trimmed_text:
                            # Whitespace-only spans never reach this point.
                            span_origin = span['origin']
                            aligned_bbox = _correct_bbox(span['bbox'], span_origin)
                            record = {
                                'original_text': raw_text,
                                'stripped_text': trimmed_text,
                                'bbox': aligned_bbox,
                                'origin': span_origin,
                                'size': span['size'],
                                'font': span['font'],
                                'color': _int_to_rgbf(span['color']),
                            }
                            spans_by_page.setdefault(page.number, []).append(record)

    return spans_by_page
-
def replace_texts(pdf_path: str, output_path: str, replacement_data: Dict, preserve_original_fonts: bool = True):
    """Redact selected text spans in a PDF and write corrected text in their place.

    Args:
        pdf_path: Path of the PDF, handed to ``pymupdf.open``.
        output_path: Destination passed to ``save`` (with ``garbage=3``,
            ``deflate=True``, ``clean=True``).
        replacement_data: Maps a page number (anything ``int()`` accepts) to a
            list of dicts. Each dict must provide ``bbox``, ``original_text``,
            ``stripped_text``, ``corrected_text``, ``origin``, ``size``,
            ``font`` and ``color``. Presumably these are the records produced
            by ``extract_text`` with ``corrected_text`` added by the caller --
            confirm against the caller.
        preserve_original_fonts: When true, the font name is resolved with
            ``_get_font_name`` against the page's font list; otherwise
            ``'helvetica'`` is used.
    """
    with pymupdf.open(pdf_path) as document:
        for page_key, entries in replacement_data.items():
            page = document[int(page_key)]
            available_fonts = page.get_fonts()

            # First pass: mark every old span, then erase them all in one go
            # before anything new is drawn on the page.
            for entry in entries:
                page.add_redact_annot(entry['bbox'])
            page.apply_redactions()

            # Second pass: draw the corrected text at each span's origin.
            for entry in entries:
                full_text = entry['original_text']
                old_core = entry['stripped_text']
                new_core = entry['corrected_text']
                # Swap only the stripped core so the span's surrounding
                # whitespace is carried over into the new text.
                new_text = full_text.replace(old_core, new_core)

                insertion_point = entry['origin']
                font_size = entry['size']
                if preserve_original_fonts:
                    font_name = _get_font_name(entry['font'], available_fonts)
                else:
                    font_name = 'helvetica'

                page.insert_text(
                    insertion_point,
                    new_text,
                    fontsize=font_size,
                    fontname=font_name,
                    color=entry['color'],
                )

        document.save(output_path, garbage=3, deflate=True, clean=True)
-
def _correct_bbox(bbox: BBox, origin: Origin) -> BBox:
    """Return *bbox* moved vertically so its bottom edge sits at ``origin[1]``.

    The box keeps its height and its x coordinates. When the bottom edge
    already equals ``origin[1]`` the coordinates are returned unchanged.

    Args:
        bbox: Rectangle as ``(x0, y0, x1, y1)``.
        origin: Point as ``(x, y)``; only the y component is used.

    Returns:
        The (possibly shifted) rectangle as a tuple.
    """
    target_bottom = origin[1]
    if bbox[3] == target_bottom:
        return tuple(bbox)

    height = bbox[3] - bbox[1]
    shifted = list(bbox)
    shifted[1], shifted[3] = target_bottom - height, target_bottom
    return tuple(shifted)
-
def _int_to_rgbf(color: int) -> Tuple[float, float, float]:
    """Split a packed ``0xRRGGBB`` integer into ``(r, g, b)`` floats.

    Each channel is the corresponding byte divided by 255, so the result
    components lie in ``[0.0, 1.0]``. Bits above the low 24 are ignored.
    """
    red, green, blue = (((color >> shift) & 0xFF) / 255 for shift in (16, 8, 0))
    return red, green, blue
-
def _get_font_name(keyword: str, fonts: List):
    """Return the name of the first font entry whose index-3 field contains *keyword*.

    Args:
        keyword: Substring to look for (callers in this file pass a span's
            ``font`` value).
        fonts: Sequence of indexable font entries; callers in this file pass
            ``page.get_fonts()``. Index 3 is searched and index 4 is returned.
            NOTE(review): per PyMuPDF's ``get_fonts`` layout these should be
            the base font name and the reference name -- confirm against the
            installed version.

    Returns:
        Index 4 of the first matching entry, or ``'helvetica'`` when nothing
        matches (including when *fonts* is empty).
    """
    matching_names = (entry[4] for entry in fonts if keyword in entry[3])
    return next(matching_names, 'helvetica')
|