# FastAPI server for online PDF processing

from typing import Tuple, Dict, List

import pymupdf
from pymupdf import TEXTFLAGS_DICT, TEXT_PRESERVE_IMAGES
  4. BBox = Tuple[float, float, float, float] # (x0, y0, x1, y1) or top-left and bottom-right corners
  5. Origin = Tuple[float, float] # (x, y) or (x1, y0) or bottom-left corner
  6. def extract_text(pdf_path: str):
  7. flags = TEXTFLAGS_DICT & ~TEXT_PRESERVE_IMAGES
  8. extracted_text = {}
  9. with pymupdf.open(pdf_path) as pdf_file:
  10. for page in pdf_file:
  11. text_dict = page.get_text('dict', flags=flags, sort=True)
  12. for block in text_dict['blocks']:
  13. for line in block['lines']:
  14. for span in line['spans']:
  15. original_text = span['text']
  16. stripped_text = original_text.strip()
  17. if not stripped_text:
  18. continue
  19. origin = span['origin']
  20. bbox = span['bbox']
  21. corrected_bbox = _correct_bbox(bbox, origin)
  22. text_with_metadata = {
  23. 'original_text': span['text'],
  24. 'stripped_text': stripped_text,
  25. 'bbox': corrected_bbox,
  26. 'origin': origin,
  27. 'size': span['size'],
  28. 'font': span['font'],
  29. 'color': _int_to_rgbf(span['color']),
  30. }
  31. extracted_text.setdefault(page.number, []).append(text_with_metadata)
  32. return extracted_text
  33. def replace_texts(pdf_path: str, output_path: str, replacement_data: Dict, preserve_original_fonts: bool = True):
  34. with pymupdf.open(pdf_path) as pdf_file:
  35. for page_number, page_replacements in replacement_data.items():
  36. page = pdf_file[int(page_number)]
  37. page_fonts = page.get_fonts()
  38. # Add redact annotation to remove the original text
  39. for replacement in page_replacements:
  40. page.add_redact_annot(replacement['bbox'])
  41. page.apply_redactions()
  42. # Insert the corrected text
  43. for replacement in page_replacements:
  44. original_text = replacement['original_text']
  45. stripped_text = replacement['stripped_text']
  46. corrected_text = replacement['corrected_text']
  47. replacement_text = original_text.replace(stripped_text, corrected_text)
  48. page.insert_text(
  49. replacement['origin'],
  50. replacement_text,
  51. fontsize=replacement['size'],
  52. fontname=_get_font_name(replacement['font'], page_fonts) if preserve_original_fonts else 'helvetica',
  53. color=replacement['color'],
  54. )
  55. pdf_file.save(output_path, garbage=3, deflate=True, clean=True)
  56. def _correct_bbox(bbox: BBox, origin: Origin) -> BBox:
  57. corrected_bbox = list(bbox)
  58. if bbox[3] != origin[1]:
  59. corrected_bbox[1] = origin[1] - (bbox[3] - bbox[1])
  60. corrected_bbox[3] = origin[1]
  61. return tuple(corrected_bbox)
  62. def _int_to_rgbf(color: int) -> Tuple[float, float, float]:
  63. return (
  64. ((color >> 16) & 0xFF) / 255,
  65. ((color >> 8) & 0xFF) / 255,
  66. (color & 0xFF) / 255,
  67. )
  68. def _get_font_name(keyword: str, fonts: List):
  69. for font in fonts:
  70. if keyword in font[3]:
  71. return font[4]
  72. return 'helvetica'