FastAPI server for online PDF processing

extraction.py 1.5KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. from fastapi import APIRouter
  2. from pydantic import BaseModel
  3. from tempfile import TemporaryDirectory
  4. from glob import iglob
  5. from pdf.processor import extract_text
  6. from core.config import settings
  7. import base64
  8. import os
  9. import zipfile
  10. import json
  11. router = APIRouter()
  12. class ExtractionRequest(BaseModel):
  13. pdf_zip: str # Base64 encoded zip file
  14. @router.post("/extraction")
  15. async def extraction(request: ExtractionRequest):
  16. base_dir = settings.BASE_DIR
  17. pdf_zip = base64.b64decode(request.pdf_zip)
  18. with TemporaryDirectory(dir=base_dir, prefix=f'tmp_', ignore_cleanup_errors=True) as temp_dir:
  19. # Save the zip file to a temporary directory
  20. pdf_zip_path = os.path.join(temp_dir, 'pdf.zip')
  21. with open(pdf_zip_path, 'wb') as f:
  22. f.write(pdf_zip)
  23. pdf_dir = os.path.join(temp_dir, 'pdf')
  24. # Extract the PDFs from the zip file
  25. with zipfile.ZipFile(pdf_zip_path, 'r') as zip_ref:
  26. zip_ref.extractall(pdf_dir)
  27. # Extract text from the PDFs
  28. pattern = os.path.join(pdf_dir, '*.pdf')
  29. extracted_texts = dict()
  30. for pdf_path in iglob(pattern):
  31. pdf_name = os.path.basename(pdf_path)
  32. extracted_text = extract_text(pdf_path)
  33. extracted_texts[pdf_name] = extracted_text
  34. extracted_texts = json.dumps(extracted_texts).encode('utf-8')
  35. return {
  36. 'status': 'success',
  37. 'data': base64.b64encode(extracted_texts).decode('utf-8')
  38. }