Spaces:
Sleeping
Sleeping
| import os | |
| import io | |
| import json | |
| from typing import List, Tuple, Dict, Any | |
| import fitz # PyMuPDF | |
| from PIL import Image | |
| import gradio as gr | |
| # Lazy-load the OCR model to reduce startup time and memory | |
_ocr_model = None
# Language code the cached model was built for; None until the first load.
_ocr_model_lang = None


def get_ocr_model(lang: str = "en"):
    """Return a PaddleOCR instance for ``lang``, creating it on first use.

    A single model is cached at module level together with the language it
    was built for. Asking for the same language again returns the cached
    instance; asking for a different language builds a new model and replaces
    the cached one, so only one model is held at a time.

    Args:
        lang: PaddleOCR language pack code, e.g. 'en', 'ch', 'fr', 'german'.

    Returns:
        The PaddleOCR instance configured for ``lang``.
    """
    global _ocr_model, _ocr_model_lang
    # Only reuse the cache when it was built for the requested language;
    # otherwise the language chosen by the caller would be silently ignored.
    if _ocr_model is not None and _ocr_model_lang == lang:
        return _ocr_model
    # PaddleOCR supports language packs like 'en', 'ch', 'fr', 'german', etc.
    # The Spaces container will download the model weights on first run and cache them.
    from paddleocr import PaddleOCR  # import here to avoid heavy import at startup

    # Build the new model before touching the cache so a failed load leaves
    # the previously cached model usable.
    model = PaddleOCR(use_angle_cls=True, lang=lang, show_log=False)
    _ocr_model, _ocr_model_lang = model, lang
    return model
def pdf_page_to_image(pdf_doc: fitz.Document, page_index: int, dpi: int = 170) -> Image.Image:
    """Rasterize one page of an open PDF into an RGB PIL image.

    Args:
        pdf_doc: Open PyMuPDF document to render from.
        page_index: Page index forwarded to ``load_page``.
        dpi: Target resolution; divided by 72 to obtain the zoom factor.

    Returns:
        A PIL image built from the rendered pixmap's raw samples.
    """
    target_page = pdf_doc.load_page(page_index)
    scale = dpi / 72.0  # 72 dpi is PDF default
    pixmap = target_page.get_pixmap(
        matrix=fitz.Matrix(scale, scale),
        alpha=False,  # no alpha channel, so the samples are read as "RGB" below
    )
    return Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
def run_paddle_ocr_on_image(image: Image.Image, lang: str = "en") -> Tuple[str, List[Dict[str, Any]]]:
    """Run PaddleOCR on a single image and collect the recognized lines.

    Args:
        image: PIL image to recognize; converted to a numpy array first.
        lang: Language code forwarded to ``get_ocr_model``.

    Returns:
        A tuple ``(text, items)`` where ``text`` is every recognized string
        joined with newlines and ``items`` is a list of dicts with the keys
        ``"bbox"``, ``"text"`` and ``"confidence"``, in detection order.
    """
    engine = get_ocr_model(lang=lang)
    # PaddleOCR is handed a numpy array rather than the PIL image itself.
    import numpy as np

    raw_result = engine.ocr(np.array(image), cls=True)

    # PaddleOCR returns list per image: [[(box, (text, conf)), ...]]
    # A per-image entry may be None, in which case it is skipped.
    items: List[Dict[str, Any]] = [
        {"bbox": det[0], "text": det[1][0], "confidence": float(det[1][1])}
        for per_image in raw_result
        if per_image is not None
        for det in per_image
    ]
    joined_text = "\n".join(entry["text"] for entry in items)
    return joined_text, items
| def extract_text_from_pdf(file_obj, dpi: int = 170, max_pages: int | None = None, lang: str = "en") -> Tuple[str, str]: | |
| """ | |
| Returns combined text and a JSON string with per-page OCR results. | |
| """ | |
| if file_obj is None: | |
| return "", json.dumps({"pages": []}, ensure_ascii=False) | |
| # Gradio may pass a path or a tempfile.NamedTemporaryFile-like with .name | |
| pdf_path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", None) | |
| if pdf_path is None or not os.path.exists(pdf_path): | |
| # If bytes were passed, fall back to reading from buffer | |
| file_bytes = file_obj.read() if hasattr(file_obj, "read") else None | |
| if not file_bytes: | |
| return "", json.dumps({"pages": []}, ensure_ascii=False) | |
| pdf_doc = fitz.open(stream=file_bytes, filetype="pdf") | |
| else: | |
| pdf_doc = fitz.open(pdf_path) | |
| try: | |
| num_pages = pdf_doc.page_count | |
| if max_pages is not None: | |
| num_pages = min(num_pages, max_pages) | |
| all_text_lines: List[str] = [] | |
| pages_payload: List[Dict[str, Any]] = [] | |
| for page_index in range(num_pages): | |
| image = pdf_page_to_image(pdf_doc, page_index, dpi=dpi) | |
| page_text, page_items = run_paddle_ocr_on_image(image, lang=lang) | |
| all_text_lines.append(page_text) | |
| pages_payload.append({ | |
| "page": page_index + 1, | |
| "items": page_items, | |
| }) | |
| combined_text = "\n\n".join([t for t in all_text_lines if t]) | |
| json_payload = json.dumps({"pages": pages_payload}, ensure_ascii=False) | |
| return combined_text, json_payload | |
| finally: | |
| pdf_doc.close() | |
def gradio_predict(pdf_file, dpi, max_pages, lang):
    """Gradio click handler: coerce the UI values and run the PDF OCR.

    ``dpi`` is converted to ``int``. ``max_pages`` is converted to ``int``
    when it is truthy; otherwise ``None`` is passed, so an empty field or 0
    means no page limit.

    Returns:
        The ``(text, payload)`` pair produced by ``extract_text_from_pdf``.
    """
    render_dpi = int(dpi)
    if max_pages:
        page_limit = int(max_pages)
    else:
        page_limit = None
    ocr_text, ocr_json = extract_text_from_pdf(
        pdf_file,
        dpi=render_dpi,
        max_pages=page_limit,
        lang=lang,
    )
    return ocr_text, ocr_json
# Gradio UI definition: a PDF upload, OCR settings (DPI, page limit, language),
# a text box and a JSON view for the results, and a button that calls
# gradio_predict with the four inputs.
# NOTE(review): indentation was lost in this copy of the file, so the nesting
# of the Row/Column contexts below cannot be read from it - confirm against
# the original layout before reformatting.
| with gr.Blocks(title="PDF OCR with PaddleOCR + PyMuPDF") as demo: | |
| gr.Markdown(""" | |
| # PDF OCR (PaddleOCR + PyMuPDF) | |
| Upload a PDF to extract text using OCR. Processes each page as an image rendered by PyMuPDF, then recognizes text with PaddleOCR. | |
| """) | |
| with gr.Row(): | |
| pdf_input = gr.File(label="PDF", file_types=[".pdf"], file_count="single") | |
| with gr.Column(): | |
| dpi_input = gr.Slider(100, 300, value=170, step=10, label="Render DPI (higher = slower but more accurate)") | |
# Left empty by default; gradio_predict passes None (no limit) for a falsy value.
| max_pages_input = gr.Number(value=None, label="Max pages (optional)") | |
| lang_input = gr.Dropdown(choices=["en", "ch", "fr", "german", "korean", "japanese", "ta", "te", "latin"], value="en", label="OCR Language") | |
| with gr.Row(): | |
| text_output = gr.Textbox(label="Extracted Text", lines=15) | |
| json_output = gr.JSON(label="Per-page OCR details (bbox, text, confidence)") | |
| run_btn = gr.Button("Run OCR") | |
# Input order here must match gradio_predict's parameter order
# (pdf_file, dpi, max_pages, lang); outputs receive its (text, payload) pair.
| run_btn.click(gradio_predict, inputs=[pdf_input, dpi_input, max_pages_input, lang_input], outputs=[text_output, json_output]) | |
# The examples list is empty, so no example inputs are registered.
| gr.Examples( | |
| examples=[], | |
| inputs=[pdf_input, dpi_input, max_pages_input, lang_input], | |
| ) | |
| # Enable simple API for clients via gradio_client or Spaces Inference API | |
| gr.Markdown(""" | |
| ## API usage | |
| - Use `gradio_client` to call this Space programmatically. | |
| - Endpoint function: `gradio_predict(pdf_file, dpi, max_pages, lang)` returning `(text, json)`. | |
| """) | |
# Start the app only when this file is executed as a script.
| if __name__ == "__main__": | |
| # On Spaces, the host/port are managed by the platform. Locally, this runs on 7860 by default. | |
| demo.launch() | |