File size: 5,493 Bytes
c8588c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os
import io
import json
from typing import List, Tuple, Dict, Any

import fitz  # PyMuPDF
from PIL import Image
import gradio as gr


# Lazy-load OCR models to reduce startup time and memory.
# Cached per language: a single shared instance would silently keep
# serving the first-requested language even after the user picks another
# one in the UI dropdown.
_ocr_models: Dict[str, Any] = {}


def get_ocr_model(lang: str = "en"):
    """Return a cached PaddleOCR instance for *lang*, creating it on first use.

    Args:
        lang: PaddleOCR language pack code ('en', 'ch', 'fr', 'german', ...).

    Returns:
        A ready-to-use ``paddleocr.PaddleOCR`` instance for that language.
    """
    model = _ocr_models.get(lang)
    if model is not None:
        return model

    # Imported here to avoid the heavy paddle import at module load time.
    # On Spaces, model weights are downloaded and cached on first run.
    from paddleocr import PaddleOCR

    model = PaddleOCR(use_angle_cls=True, lang=lang, show_log=False)
    _ocr_models[lang] = model
    return model


def pdf_page_to_image(pdf_doc: fitz.Document, page_index: int, dpi: int = 170) -> Image.Image:
    """Render one PDF page to an RGB PIL image at the requested DPI."""
    # PDF user space is defined at 72 DPI, so scale by dpi / 72 on both axes.
    scale = dpi / 72.0
    page = pdf_doc.load_page(page_index)
    pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
    return Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)


def run_paddle_ocr_on_image(image: Image.Image, lang: str = "en") -> Tuple[str, List[Dict[str, Any]]]:
    """OCR a PIL image; return the recognized text plus per-detection details.

    Returns:
        A tuple of (newline-joined text lines, list of dicts with
        ``bbox``, ``text`` and ``confidence`` keys, one per detection).
    """
    # Local import: numpy is only needed at inference time.
    import numpy as np

    model = get_ocr_model(lang=lang)
    result = model.ocr(np.array(image), cls=True)

    texts: List[str] = []
    detections: List[Dict[str, Any]] = []

    # PaddleOCR returns one entry per input image: [[(box, (text, conf)), ...]]
    for page_result in result:
        if page_result is None:
            continue
        for detection in page_result:
            box, recognition = detection[0], detection[1]
            text = recognition[0]
            texts.append(text)
            detections.append({
                "bbox": box,
                "text": text,
                "confidence": float(recognition[1]),
            })

    return "\n".join(texts), detections


def extract_text_from_pdf(file_obj, dpi: int = 170, max_pages: int | None = None, lang: str = "en") -> Tuple[str, str]:
    """OCR every page of a PDF.

    Args:
        file_obj: A filesystem path (str), a file-like object with a ``.name``
            path attribute (as Gradio passes), a buffer exposing ``.read()``,
            or the raw PDF bytes. (Previously raw bytes fell through every
            branch and silently produced an empty result.)
        dpi: Resolution at which pages are rasterized before OCR.
        max_pages: If given, only the first ``max_pages`` pages are processed.
        lang: PaddleOCR language pack code.

    Returns:
        ``(combined_text, json_payload)`` — the per-page texts joined by blank
        lines, and a JSON string ``{"pages": [{"page": n, "items": [...]}]}``.
    """
    empty_result = ("", json.dumps({"pages": []}, ensure_ascii=False))
    if file_obj is None:
        return empty_result

    if isinstance(file_obj, (bytes, bytearray)):
        # Raw PDF bytes passed directly (e.g. from an API client).
        pdf_doc = fitz.open(stream=bytes(file_obj), filetype="pdf")
    else:
        # Gradio may pass a path or a tempfile.NamedTemporaryFile-like with .name
        pdf_path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", None)
        if pdf_path is not None and os.path.exists(pdf_path):
            pdf_doc = fitz.open(pdf_path)
        else:
            # Last resort: read the whole document from a buffer-like object.
            file_bytes = file_obj.read() if hasattr(file_obj, "read") else None
            if not file_bytes:
                return empty_result
            pdf_doc = fitz.open(stream=file_bytes, filetype="pdf")

    try:
        num_pages = pdf_doc.page_count
        if max_pages is not None:
            num_pages = min(num_pages, max_pages)

        all_text_lines: List[str] = []
        pages_payload: List[Dict[str, Any]] = []

        for page_index in range(num_pages):
            image = pdf_page_to_image(pdf_doc, page_index, dpi=dpi)
            page_text, page_items = run_paddle_ocr_on_image(image, lang=lang)

            all_text_lines.append(page_text)
            pages_payload.append({
                "page": page_index + 1,  # 1-based page numbers for humans
                "items": page_items,
            })

        combined_text = "\n\n".join(t for t in all_text_lines if t)
        json_payload = json.dumps({"pages": pages_payload}, ensure_ascii=False)

        return combined_text, json_payload
    finally:
        # Always release the document handle, even if rendering/OCR fails mid-way.
        pdf_doc.close()


def gradio_predict(pdf_file, dpi, max_pages, lang):
    """Gradio callback: coerce widget values and run the OCR pipeline."""
    # gr.Number yields None (or 0 when cleared); both mean "no page limit".
    page_limit = int(max_pages) if max_pages else None
    return extract_text_from_pdf(pdf_file, dpi=int(dpi), max_pages=page_limit, lang=lang)


# UI definition. Built at import time so Spaces can pick up `demo` directly.
with gr.Blocks(title="PDF OCR with PaddleOCR + PyMuPDF") as demo:
    gr.Markdown("""

    # PDF OCR (PaddleOCR + PyMuPDF)



    Upload a PDF to extract text using OCR. Processes each page as an image rendered by PyMuPDF, then recognizes text with PaddleOCR.

    """)

    # Input row: the PDF upload plus tuning controls.
    with gr.Row():
        pdf_input = gr.File(label="PDF", file_types=[".pdf"], file_count="single")
        with gr.Column():
            dpi_input = gr.Slider(100, 300, value=170, step=10, label="Render DPI (higher = slower but more accurate)")
            # Optional page cap; empty means "process every page".
            max_pages_input = gr.Number(value=None, label="Max pages (optional)")
            lang_input = gr.Dropdown(choices=["en", "ch", "fr", "german", "korean", "japanese", "ta", "te", "latin"], value="en", label="OCR Language")

    # Output row: plain text alongside the structured per-page details.
    with gr.Row():
        text_output = gr.Textbox(label="Extracted Text", lines=15)
        json_output = gr.JSON(label="Per-page OCR details (bbox, text, confidence)")

    # Wire the button to the OCR callback defined above.
    run_btn = gr.Button("Run OCR")
    run_btn.click(gradio_predict, inputs=[pdf_input, dpi_input, max_pages_input, lang_input], outputs=[text_output, json_output])

    # No bundled example PDFs yet; kept so examples can be added later.
    gr.Examples(
        examples=[],
        inputs=[pdf_input, dpi_input, max_pages_input, lang_input],
    )

    # Enable simple API for clients via gradio_client or Spaces Inference API
    gr.Markdown("""

    ## API usage

    - Use `gradio_client` to call this Space programmatically.

    - Endpoint function: `gradio_predict(pdf_file, dpi, max_pages, lang)` returning `(text, json)`.

    """)


# Entry point for local runs; Spaces imports `demo` and ignores this guard.
if __name__ == "__main__":
    # On Spaces, the host/port are managed by the platform. Locally, this runs on 7860 by default.
    demo.launch()