| 12345678910111213141516171819202122232425262728293031323334353637383940414243 |
import base64
import json
import os
import zipfile
from glob import iglob
from tempfile import TemporaryDirectory

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from core.config import settings
from pdf.processor import extract_text
-
# Router that collects this module's endpoints; the route decorators below
# register their handlers on it.
router = APIRouter()
-
# Request body for the /extraction endpoint.
# (Documented with comments rather than a docstring so the generated schema
# description stays exactly as it was.)
class ExtractionRequest(BaseModel):
    # Base64-encoded bytes of a zip archive; the endpoint decodes it and
    # processes the ``*.pdf`` files found at the top level of the archive.
    pdf_zip: str
-
@router.post("/extraction")
async def extraction(request: ExtractionRequest):
    """Extract text from every top-level PDF in a base64-encoded zip archive.

    The archive is decoded, unpacked into a temporary directory created under
    ``settings.BASE_DIR``, and each ``*.pdf`` file found directly inside the
    unpacked directory is passed to ``extract_text``.

    Returns a dict with ``status`` set to ``'success'`` and ``data`` holding
    the base64 encoding of a UTF-8 JSON object that maps each PDF file name
    to the value ``extract_text`` returned for it.

    Responds with HTTP 400 when ``pdf_zip`` is not valid base64 or does not
    contain a readable zip archive.
    """
    # Malformed client input must surface as a 4xx, not an unhandled 500.
    try:
        pdf_zip = base64.b64decode(request.pdf_zip)
    except ValueError as exc:  # binascii.Error is a ValueError subclass
        raise HTTPException(
            status_code=400,
            detail='pdf_zip is not valid base64',
        ) from exc

    # NOTE(review): everything below is blocking file I/O and parsing inside
    # an ``async def`` handler, so it runs on the event loop. Consider moving
    # it to a worker thread once extract_text is confirmed to be thread-safe.
    # NOTE(review): the archive comes from the client and its uncompressed
    # size is not bounded here; consider enforcing a limit before extracting.
    with TemporaryDirectory(
        dir=settings.BASE_DIR,
        prefix='tmp_',
        ignore_cleanup_errors=True,
    ) as temp_dir:
        # Save the zip file to the temporary directory.
        pdf_zip_path = os.path.join(temp_dir, 'pdf.zip')
        with open(pdf_zip_path, 'wb') as f:
            f.write(pdf_zip)
        pdf_dir = os.path.join(temp_dir, 'pdf')

        # Unpack the archive. BadZipFile covers corrupt or non-zip payloads;
        # RuntimeError (and its subclass NotImplementedError) is what zipfile
        # raises for encrypted members and unsupported compression methods.
        try:
            with zipfile.ZipFile(pdf_zip_path, 'r') as zip_ref:
                zip_ref.extractall(pdf_dir)
        except (zipfile.BadZipFile, RuntimeError) as exc:
            raise HTTPException(
                status_code=400,
                detail='pdf_zip is not a readable zip archive',
            ) from exc

        # Extract text from the PDFs at the top level of the archive,
        # keyed by file name.
        pattern = os.path.join(pdf_dir, '*.pdf')
        extracted_texts = {
            os.path.basename(pdf_path): extract_text(pdf_path)
            for pdf_path in iglob(pattern)
        }

    payload = json.dumps(extracted_texts).encode('utf-8')
    return {
        'status': 'success',
        'data': base64.b64encode(payload).decode('utf-8'),
    }
|