"""HTTP endpoint that extracts text from a base64-encoded zip of PDF files."""

import base64
import io
import json
import os
import zipfile
from glob import iglob
from tempfile import TemporaryDirectory

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from core.config import settings
from pdf.processor import extract_text

router = APIRouter()


class ExtractionRequest(BaseModel):
    """Request body for the ``/extraction`` endpoint."""

    pdf_zip: str  # Base64 encoded zip file


@router.post("/extraction")
async def extraction(request: ExtractionRequest):
    """Extract text from every top-level ``*.pdf`` file in the uploaded zip.

    The zip archive is decoded from ``request.pdf_zip``, unpacked into a
    temporary directory created under ``settings.BASE_DIR``, and each file
    matching ``*.pdf`` directly inside the extraction directory is passed to
    ``extract_text``. PDFs in sub-folders of the archive are not processed.

    Args:
        request: Body carrying the base64-encoded zip archive.

    Returns:
        A dict with ``status`` set to ``'success'`` and ``data`` holding the
        base64-encoded UTF-8 JSON object mapping each PDF file name to the
        value returned by ``extract_text``.

    Raises:
        HTTPException: 400 if ``pdf_zip`` is not valid base64 or the decoded
            bytes are not a valid zip archive.
    """
    base_dir = settings.BASE_DIR

    try:
        pdf_zip = base64.b64decode(request.pdf_zip)
    except ValueError as exc:
        # binascii.Error (bad padding/alphabet) is a ValueError subclass;
        # report it as a client error instead of an unhandled 500.
        raise HTTPException(
            status_code=400, detail='pdf_zip is not valid base64'
        ) from exc

    with TemporaryDirectory(
        dir=base_dir, prefix='tmp_', ignore_cleanup_errors=True
    ) as temp_dir:
        pdf_dir = os.path.join(temp_dir, 'pdf')

        # Open the archive straight from memory; there is no need to write
        # the zip to disk only to read it back.
        # NOTE(review): the archive is client-supplied; consider limiting the
        # member count / uncompressed size before extracting.
        try:
            with zipfile.ZipFile(io.BytesIO(pdf_zip), 'r') as zip_ref:
                zip_ref.extractall(pdf_dir)
        except zipfile.BadZipFile as exc:
            raise HTTPException(
                status_code=400, detail='pdf_zip is not a valid zip archive'
            ) from exc

        # Extract text from the PDFs found directly inside the extraction dir.
        # NOTE(review): extract_text is synchronous and runs on the event
        # loop here; presumably acceptable for small uploads -- confirm.
        pattern = os.path.join(pdf_dir, '*.pdf')
        extracted_texts = {
            os.path.basename(pdf_path): extract_text(pdf_path)
            for pdf_path in iglob(pattern)
        }

    # Assumes extract_text returns a JSON-serializable value -- TODO confirm.
    payload = json.dumps(extracted_texts).encode('utf-8')
    return {
        'status': 'success',
        'data': base64.b64encode(payload).decode('utf-8'),
    }