Ver código fonte

add: FastAPI

Michael 1 ano atrás
pai
commit
929e86bfc9

+ 2
- 0
.env.example Ver arquivo

@@ -0,0 +1,2 @@
1
+BASE_DIR=
2
+API_KEY=

+ 10
- 0
.gitignore Ver arquivo

@@ -0,0 +1,10 @@
1
+.env
2
+__pycache__
3
+*.pyc
4
+*.pyo
5
+*.pyd
6
+venv/
7
+.venv/
8
+.vscode/
9
+.idea/
10
+*.swp

+ 0
- 0
app/__init__.py Ver arquivo


+ 0
- 0
app/core/__init__.py Ver arquivo


+ 13
- 0
app/core/auth.py Ver arquivo

@@ -0,0 +1,13 @@
1
+from fastapi.security import APIKeyHeader
2
+from fastapi import Security, HTTPException, status
3
+from .config import settings
4
+
5
+api_key_header = APIKeyHeader(name='X-API-Key')
6
+
7
def validate_api_key(api_key: str = Security(api_key_header)):
    """Authenticate a request by the value of its ``X-API-Key`` header.

    Args:
        api_key: Header value resolved by FastAPI through ``api_key_header``.

    Returns:
        The supplied key when it matches ``settings.API_KEY``.

    Raises:
        HTTPException: 403 when the key does not match, or when the server
            has no (or an empty) key configured.
    """
    # Function-scope import so the module's import block stays untouched.
    import secrets

    expected_key = settings.API_KEY
    # An unset or empty server key must never authenticate anyone. The
    # comparison itself is constant-time so response timing does not reveal
    # how many leading characters of a guess were correct. Both sides are
    # encoded to bytes because compare_digest only accepts ASCII-only str.
    if api_key and expected_key and secrets.compare_digest(
        api_key.encode("utf-8"), expected_key.encode("utf-8")
    ):
        return api_key
    raise HTTPException(
        status_code=status.HTTP_403_FORBIDDEN,
        detail="Could not validate credentials",
    )

+ 11
- 0
app/core/config.py Ver arquivo

@@ -0,0 +1,11 @@
1
+from pydantic_settings import BaseSettings
2
+from dotenv import load_dotenv
3
+import os
4
+
5
+load_dotenv()
6
+
7
class Settings(BaseSettings):
    # Application settings populated from the process environment.
    #
    # BaseSettings looks each field up in the environment by name, so the
    # fields no longer carry an `os.getenv(...)` default evaluated at import
    # time. The previous defaults were `None` whenever a variable was unset
    # even though the fields were annotated as plain `str`; the annotations
    # below state that possibility honestly. Resulting values are unchanged:
    # the environment value when present, otherwise None.

    # Directory passed as `dir=` when the routers create their per-request
    # temporary directories. None leaves the choice to `tempfile`.
    BASE_DIR: str | None = None
    # Secret that `X-API-Key` is compared against. When None, no request can
    # authenticate.
    API_KEY: str | None = None
10
+
11
+settings = Settings()

+ 8
- 0
app/main.py Ver arquivo

@@ -0,0 +1,8 @@
1
+from fastapi import FastAPI, Depends
2
+from routers import extraction, replacement
3
+from core.auth import validate_api_key
4
+
5
# The ASGI application object.
app = FastAPI()

# Both routers are mounted behind the API-key dependency, so every endpoint
# they declare requires a valid X-API-Key header.
app.include_router(
    extraction.router,
    tags=["extraction"],
    dependencies=[Depends(validate_api_key)],
)
app.include_router(
    replacement.router,
    tags=["replacement"],
    dependencies=[Depends(validate_api_key)],
)

+ 0
- 0
app/pdf/__init__.py Ver arquivo


+ 83
- 0
app/pdf/processor.py Ver arquivo

@@ -0,0 +1,83 @@
1
+from pymupdf import TEXTFLAGS_DICT, TEXT_PRESERVE_IMAGES
2
+from typing import Tuple, Dict, List
3
+
4
+import pymupdf
5
+
6
# Rectangle as (x0, y0, x1, y1): top-left corner followed by bottom-right corner.
BBox = Tuple[float, float, float, float]
# Point as (x, y) taken from a span's 'origin'. _correct_bbox uses its y value
# as the bottom edge of the corrected box.
# NOTE(review): the earlier comment also described this as "(x1, y0)" and as
# the "bottom-left corner"; those two readings disagree - confirm which is meant.
Origin = Tuple[float, float]
8
+
9
def extract_text(pdf_path: str):
    """Collect every non-blank text span of a PDF, grouped by page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A dict keyed by ``page.number``. Each value is a list with one dict
        per span holding ``original_text``, ``stripped_text``, ``bbox``
        (after ``_correct_bbox``), ``origin``, ``size``, ``font`` and
        ``color`` (converted by ``_int_to_rgbf``). Pages without any
        non-blank span have no entry.
    """
    # Dict-extraction flags with the TEXT_PRESERVE_IMAGES bit cleared.
    text_only_flags = TEXTFLAGS_DICT & ~TEXT_PRESERVE_IMAGES
    spans_by_page = {}
    with pymupdf.open(pdf_path) as document:
        for page in document:
            layout = page.get_text('dict', flags=text_only_flags, sort=True)
            # Flatten blocks -> lines -> spans; every block is assumed to
            # carry a 'lines' entry.
            page_spans = (
                span
                for block in layout['blocks']
                for line in block['lines']
                for span in line['spans']
            )
            for span in page_spans:
                raw_text = span['text']
                trimmed_text = raw_text.strip()
                if trimmed_text:
                    span_origin = span['origin']
                    span_record = {
                        'original_text': raw_text,
                        'stripped_text': trimmed_text,
                        'bbox': _correct_bbox(span['bbox'], span_origin),
                        'origin': span_origin,
                        'size': span['size'],
                        'font': span['font'],
                        'color': _int_to_rgbf(span['color']),
                    }
                    spans_by_page.setdefault(page.number, []).append(span_record)
    return spans_by_page
36
+
37
def replace_texts(pdf_path: str, output_path: str, replacement_data: Dict, preserve_original_fonts: bool = True):
    """Redact selected spans of a PDF and write corrected text in their place.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path the modified PDF is saved to.
        replacement_data: Mapping of page number (anything ``int()`` accepts)
            to a list of span dicts. Each dict must provide ``bbox``,
            ``original_text``, ``stripped_text``, ``corrected_text``,
            ``origin``, ``size``, ``font`` and ``color``.
        preserve_original_fonts: When true, the font name is looked up with
            ``_get_font_name`` among the page's fonts; otherwise
            ``'helvetica'`` is used.
    """
    with pymupdf.open(pdf_path) as document:
        for page_key, edits in replacement_data.items():
            page = document[int(page_key)]
            # Font list is captured before the redactions are applied.
            fonts_on_page = page.get_fonts()

            # First pass: mark every old span, then apply all redactions at once.
            for edit in edits:
                page.add_redact_annot(edit['bbox'])
            page.apply_redactions()

            # Second pass: write the corrected text at each span's origin.
            for edit in edits:
                # Swapping only the stripped core keeps the span's original
                # leading/trailing whitespace around the corrected text.
                new_text = edit['original_text'].replace(
                    edit['stripped_text'], edit['corrected_text']
                )
                if preserve_original_fonts:
                    font_name = _get_font_name(edit['font'], fonts_on_page)
                else:
                    font_name = 'helvetica'
                page.insert_text(
                    edit['origin'],
                    new_text,
                    fontsize=edit['size'],
                    fontname=font_name,
                    color=edit['color'],
                )
        document.save(output_path, garbage=3, deflate=True, clean=True)
64
+
65
def _correct_bbox(bbox: BBox, origin: Origin) -> BBox:
    """Shift a box vertically so its bottom edge sits on the origin's y value.

    The box keeps its x extent and its height. When the bottom edge already
    equals ``origin[1]`` the coordinates are returned untouched, which avoids
    re-deriving the top edge through float arithmetic.
    """
    left, top, right, bottom = bbox
    target_bottom = origin[1]
    if bottom == target_bottom:
        return (left, top, right, bottom)
    height = bottom - top
    return (left, target_bottom - height, right, target_bottom)
71
+
72
def _int_to_rgbf(color: int) -> Tuple[float, float, float]:
    """Split a packed 0xRRGGBB integer into (red, green, blue) floats in 0..1."""
    # Shift each 8-bit channel down to the low byte, mask it, and normalise.
    return tuple(((color >> shift) & 0xFF) / 255 for shift in (16, 8, 0))
78
+
79
def _get_font_name(keyword: str, fonts: List):
    """Return item 4 of the first font entry whose item 3 contains *keyword*.

    Falls back to ``'helvetica'`` when no entry matches. Callers pass the
    result of ``page.get_fonts()`` as *fonts*.
    """
    matches = (entry[4] for entry in fonts if keyword in entry[3])
    return next(matches, 'helvetica')

+ 0
- 0
app/routers/__init__.py Ver arquivo


+ 43
- 0
app/routers/extraction.py Ver arquivo

@@ -0,0 +1,43 @@
1
+from fastapi import APIRouter
2
+from pydantic import BaseModel
3
+from tempfile import TemporaryDirectory
4
+from glob import iglob
5
+from pdf.processor import extract_text
6
+from core.config import settings
7
+import base64
8
+import os
9
+import zipfile
10
+import json
11
+
12
+router = APIRouter()
13
+
14
class ExtractionRequest(BaseModel):
    # Request body of POST /extraction.
    # The handler base64-decodes this value, writes it out as a zip archive
    # and processes the `*.pdf` files found at the archive's top level.
    pdf_zip: str # Base64 encoded zip file
16
+
17
@router.post("/extraction")
async def extraction(request: ExtractionRequest):
    """Extract text spans from every PDF in an uploaded zip archive.

    Args:
        request: Body whose ``pdf_zip`` is a base64-encoded zip archive.

    Returns:
        ``{'status': 'success', 'data': <base64>}`` where the decoded data is
        a UTF-8 JSON object mapping each PDF file name to the result of
        ``extract_text`` for that file. Only ``*.pdf`` files at the top level
        of the archive are processed.

    Raises:
        HTTPException: 400 when ``pdf_zip`` is not valid base64 or does not
            decode to a zip archive (previously an unhandled exception).
    """
    # Function-scope import so the module's import block stays untouched.
    from fastapi import HTTPException

    try:
        pdf_zip = base64.b64decode(request.pdf_zip)
    except ValueError as exc:  # binascii.Error is a ValueError subclass
        raise HTTPException(status_code=400, detail="pdf_zip is not valid base64") from exc

    with TemporaryDirectory(dir=settings.BASE_DIR, prefix='tmp_', ignore_cleanup_errors=True) as temp_dir:
        # Save the zip file to the temporary directory
        pdf_zip_path = os.path.join(temp_dir, 'pdf.zip')
        with open(pdf_zip_path, 'wb') as f:
            f.write(pdf_zip)
        pdf_dir = os.path.join(temp_dir, 'pdf')

        # Extract the PDFs from the zip file
        try:
            with zipfile.ZipFile(pdf_zip_path, 'r') as zip_ref:
                zip_ref.extractall(pdf_dir)
        except zipfile.BadZipFile as exc:
            raise HTTPException(status_code=400, detail="pdf_zip is not a valid zip archive") from exc

        # Extract text from the PDFs, keyed by file name
        extracted_texts = {
            os.path.basename(pdf_path): extract_text(pdf_path)
            for pdf_path in iglob(os.path.join(pdf_dir, '*.pdf'))
        }
        payload = json.dumps(extracted_texts).encode('utf-8')
        return {
            'status': 'success',
            'data': base64.b64encode(payload).decode('utf-8')
        }

+ 55
- 0
app/routers/replacement.py Ver arquivo

@@ -0,0 +1,55 @@
1
+from fastapi import APIRouter
2
+from pydantic import BaseModel
3
+from tempfile import TemporaryDirectory
4
+from glob import iglob
5
+from pdf.processor import replace_texts
6
+from core.config import settings
7
+import base64
8
+import os
9
+import zipfile
10
+import json
11
+
12
+router = APIRouter()
13
+
14
class ReplacementRequest(BaseModel):
    # Request body of POST /replacement.
    # Archive holding the PDFs to modify; only `*.pdf` files at its top level
    # are processed by the handler.
    pdf_zip: str # Base64 encoded zip file
    # Decoded by the handler into an object indexed by PDF file name; each
    # value is handed to `replace_texts` as its `replacement_data` argument.
    replacement_data: str # Base64 encoded JSON string
    # Forwarded unchanged to `replace_texts`.
    preserve_original_fonts: bool = True
18
+
19
@router.post("/replacement")
async def replacement(request: ReplacementRequest):
    """Apply text replacements to every PDF in an uploaded zip archive.

    Args:
        request: Body carrying ``pdf_zip`` (base64 zip archive),
            ``replacement_data`` (base64 JSON object keyed by PDF file name)
            and ``preserve_original_fonts``.

    Returns:
        ``{'status': 'success', 'data': <base64>}`` where the decoded data is
        a zip archive of the rewritten PDFs. Only ``*.pdf`` files at the top
        level of the input archive are processed. A PDF without an entry in
        ``replacement_data`` is passed through ``replace_texts`` with no
        replacements instead of failing the whole request.

    Raises:
        HTTPException: 400 when either payload is not valid base64, when
            ``replacement_data`` is not a JSON object, or when ``pdf_zip`` is
            not a zip archive (previously unhandled exceptions).
    """
    # Function-scope import so the module's import block stays untouched.
    from fastapi import HTTPException

    try:
        pdf_zip = base64.b64decode(request.pdf_zip)
        replacement_data = json.loads(base64.b64decode(request.replacement_data).decode('utf-8'))
    except ValueError as exc:
        # binascii.Error, UnicodeDecodeError and json.JSONDecodeError are all
        # ValueError subclasses.
        raise HTTPException(
            status_code=400,
            detail="pdf_zip and replacement_data must be valid base64; replacement_data must decode to JSON",
        ) from exc
    if not isinstance(replacement_data, dict):
        raise HTTPException(status_code=400, detail="replacement_data must be a JSON object")

    with TemporaryDirectory(dir=settings.BASE_DIR, prefix='tmp_', ignore_cleanup_errors=True) as temp_dir:
        # Save the zip file to the temporary directory
        pdf_zip_path = os.path.join(temp_dir, 'pdf.zip')
        with open(pdf_zip_path, 'wb') as f:
            f.write(pdf_zip)

        # Extract the PDFs from the zip file
        pdf_dir = os.path.join(temp_dir, 'pdf')
        output_dir = os.path.join(temp_dir, 'output')
        os.makedirs(output_dir, exist_ok=True)
        try:
            with zipfile.ZipFile(pdf_zip_path, 'r') as zip_ref:
                zip_ref.extractall(pdf_dir)
        except zipfile.BadZipFile as exc:
            raise HTTPException(status_code=400, detail="pdf_zip is not a valid zip archive") from exc

        # Replace text in the PDFs
        for pdf_path in iglob(os.path.join(pdf_dir, '*.pdf')):
            pdf_name = os.path.basename(pdf_path)
            output_path = os.path.join(output_dir, pdf_name)
            # A missing entry means "nothing to replace" for this file.
            page_replacements = replacement_data.get(pdf_name, {})
            replace_texts(pdf_path, output_path, page_replacements, request.preserve_original_fonts)

        # Zip the output PDFs
        output_zip_path = os.path.join(temp_dir, 'output.zip')
        with zipfile.ZipFile(output_zip_path, 'w') as zip_ref:
            for output_pdf_path in iglob(os.path.join(output_dir, '*.pdf')):
                zip_ref.write(output_pdf_path, os.path.basename(output_pdf_path))

        # Context manager closes the handle even if the read fails.
        with open(output_zip_path, 'rb') as zip_file:
            data = zip_file.read()
        return {
            'status': 'success',
            'data': base64.b64encode(data).decode('utf-8'),
        }

+ 5
- 0
requirements.txt Ver arquivo

@@ -0,0 +1,5 @@
1
+fastapi==0.115.8
2
+pydantic==2.10.6
3
+pydantic_settings==2.7.1
4
+pymupdf==1.25.2
5
+python-dotenv==1.0.1