Spaces:

markobinario
/

OCRapi

Running

App Files Files Community

markobinario committed on Oct 17

Commit

c0c9942

verified ·

1 Parent(s): 575870b

Update app.py

Browse files

Files changed (1) hide show

app.py +278 -52

app.py CHANGED Viewed

@@ -1,17 +1,23 @@
 import os
 import io
 import json
-from typing import List, Tuple, Dict, Any
 import fitz  # PyMuPDF
 from PIL import Image
 import gradio as gr
 # Lazy-load the OCR model to reduce startup time and memory
 _ocr_model = None
 def get_ocr_model(lang: str = "en"):
     global _ocr_model
     if _ocr_model is not None:
@@ -24,8 +30,7 @@ def get_ocr_model(lang: str = "en"):
     _ocr_model = PaddleOCR(use_angle_cls=True, lang=lang, show_log=False)
     return _ocr_model
-def pdf_page_to_image(pdf_doc: fitz.Document, page_index: int, dpi: int = 170) -> Image.Image:
     page = pdf_doc.load_page(page_index)
     zoom = dpi / 72.0  # 72 dpi is PDF default
     mat = fitz.Matrix(zoom, zoom)
@@ -33,12 +38,9 @@ def pdf_page_to_image(pdf_doc: fitz.Document, page_index: int, dpi: int = 170) -
     img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
     return img
 def run_paddle_ocr_on_image(image: Image.Image, lang: str = "en") -> Tuple[str, List[Dict[str, Any]]]:
     ocr = get_ocr_model(lang=lang)
     # Convert PIL image to numpy array for PaddleOCR
-    import numpy as np
     img_np = np.array(image)
     result = ocr.ocr(img_np, cls=True)
@@ -58,26 +60,27 @@ def run_paddle_ocr_on_image(image: Image.Image, lang: str = "en") -> Tuple[str,
     return "\n".join(lines), items
-def extract_text_from_pdf(file_obj, dpi: int = 170, max_pages: int | None = None, lang: str = "en") -> Tuple[str, str]:
     """
-    Returns combined text and a JSON string with per-page OCR results.
     """
     if file_obj is None:
-        return "", json.dumps({"pages": []}, ensure_ascii=False)
-    # Gradio may pass a path or a tempfile.NamedTemporaryFile-like with .name
-    pdf_path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", None)
-    if pdf_path is None or not os.path.exists(pdf_path):
-        # If bytes were passed, fall back to reading from buffer
-        file_bytes = file_obj.read() if hasattr(file_obj, "read") else None
-        if not file_bytes:
-            return "", json.dumps({"pages": []}, ensure_ascii=False)
-        pdf_doc = fitz.open(stream=file_bytes, filetype="pdf")
-    else:
-        pdf_doc = fitz.open(pdf_path)
     try:
         num_pages = pdf_doc.page_count
         if max_pages is not None:
             num_pages = min(num_pages, max_pages)
@@ -97,39 +100,262 @@ def extract_text_from_pdf(file_obj, dpi: int = 170, max_pages: int | None = None
         combined_text = "\n\n".join([t for t in all_text_lines if t])
         json_payload = json.dumps({"pages": pages_payload}, ensure_ascii=False)
-        return combined_text, json_payload
-    finally:
         pdf_doc.close()
-def gradio_predict(pdf_file):
-    # Always render at a high DPI for accuracy and use English OCR by default
-    text, _ = extract_text_from_pdf(pdf_file, dpi=300, max_pages=None, lang="en")
-    return text
-with gr.Blocks(title="PDF OCR with PaddleOCR + PyMuPDF") as demo:
-    gr.Markdown("""
-    # PDF OCR (PaddleOCR + PyMuPDF)
-    Upload a PDF to extract text using OCR. The app renders pages with PyMuPDF at a high DPI and uses PaddleOCR for recognition.
     """)
-    pdf_input = gr.File(label="PDF", file_types=[".pdf"], file_count="single")
-    text_output = gr.Textbox(label="Extracted Text", lines=20)
-    # Auto-run OCR when a PDF is uploaded
-    pdf_input.change(fn=gradio_predict, inputs=[pdf_input], outputs=[text_output], api_name="predict")
-    # Simple API note
-    gr.Markdown("""
-    ## API usage
-    - Use `gradio_client` to call this Space. Function signature: `gradio_predict(pdf_file)` → `text`.
     """)
 if __name__ == "__main__":
-    # On Spaces, the host/port are managed by the platform. Locally, this runs on 7860 by default.
-    demo.launch()

 import os
 import io
 import json
+import time
+from typing import List, Tuple, Dict, Any, Optional
 import fitz  # PyMuPDF
 from PIL import Image
 import gradio as gr
+import numpy as np
+# =========================
+# Config
+# =========================
+LOGO_IMAGE_PATH = './assets/logo.jpg'
+GOOGLE_FONTS_URL = "<link href='https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap' rel='stylesheet'>"
 # Lazy-load the OCR model to reduce startup time and memory
 _ocr_model = None
 def get_ocr_model(lang: str = "en"):
     global _ocr_model
     if _ocr_model is not None:
     _ocr_model = PaddleOCR(use_angle_cls=True, lang=lang, show_log=False)
     return _ocr_model
+def pdf_page_to_image(pdf_doc: fitz.Document, page_index: int, dpi: int = 300) -> Image.Image:
     page = pdf_doc.load_page(page_index)
     zoom = dpi / 72.0  # 72 dpi is PDF default
     mat = fitz.Matrix(zoom, zoom)
     img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
     return img
 def run_paddle_ocr_on_image(image: Image.Image, lang: str = "en") -> Tuple[str, List[Dict[str, Any]]]:
     ocr = get_ocr_model(lang=lang)
     # Convert PIL image to numpy array for PaddleOCR
     img_np = np.array(image)
     result = ocr.ocr(img_np, cls=True)
     return "\n".join(lines), items
+def extract_text_from_pdf(file_obj, dpi: int = 300, max_pages: int | None = None, lang: str = "en") -> Tuple[str, str, Dict[str, Any]]:
     """
+    Returns combined text, JSON string with per-page OCR results, and processing stats.
     """
     if file_obj is None:
+        return "", json.dumps({"pages": []}, ensure_ascii=False), {"error": "No file provided"}
+    start_time = time.time()
     try:
+        # Gradio may pass a path or a tempfile.NamedTemporaryFile-like with .name
+        pdf_path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", None)
+        if pdf_path is None or not os.path.exists(pdf_path):
+            # If bytes were passed, fall back to reading from buffer
+            file_bytes = file_obj.read() if hasattr(file_obj, "read") else None
+            if not file_bytes:
+                return "", json.dumps({"pages": []}, ensure_ascii=False), {"error": "Could not read file"}
+            pdf_doc = fitz.open(stream=file_bytes, filetype="pdf")
+        else:
+            pdf_doc = fitz.open(pdf_path)
         num_pages = pdf_doc.page_count
         if max_pages is not None:
             num_pages = min(num_pages, max_pages)
         combined_text = "\n\n".join([t for t in all_text_lines if t])
         json_payload = json.dumps({"pages": pages_payload}, ensure_ascii=False)
+        processing_time = time.time() - start_time
+        stats = {
+            "pages_processed": num_pages,
+            "total_pages": pdf_doc.page_count,
+            "processing_time": round(processing_time, 2),
+            "dpi": dpi,
+            "language": lang
+        }
         pdf_doc.close()
+        return combined_text, json_payload, stats
+    except Exception as e:
+        return "", json.dumps({"pages": []}, ensure_ascii=False), {"error": str(e)}
+def handle_pdf_ocr(pdf_file: str) -> Tuple[str, str, str]:
+    """Run OCR on an uploaded PDF and format the results for the UI.
+
+    Args:
+        pdf_file: Value delivered by the file input; forwarded unchanged to
+            ``extract_text_from_pdf``.
+
+    Returns:
+        A ``(text, json_data, stats_text)`` tuple: the combined text, the
+        per-page JSON string, and a Markdown summary of the processing stats.
+
+    Raises:
+        gr.Error: If no file was provided, if extraction raised, or if
+            extraction reported an ``"error"`` entry in its stats.
+    """
+    if not pdf_file:
+        raise gr.Error("Please upload a PDF file first.")
+    print(f"Processing PDF: {pdf_file}")
+    start_time = time.time()
+    # Only the extraction call is guarded, so the gr.Error raised further down
+    # is not caught and re-wrapped with a second "Error processing PDF:" prefix.
+    try:
+        text, json_data, stats = extract_text_from_pdf(pdf_file, dpi=300, max_pages=None, lang="en")
+    except Exception as e:
+        error_msg = f"Error processing PDF: {e}"
+        print(error_msg)
+        raise gr.Error(error_msg) from e
+    print(f"PDF processing completed in {time.time() - start_time:.2f} seconds.")
+    # extract_text_from_pdf reports failures through the stats dict.
+    if "error" in stats:
+        error_msg = f"Processing failed: {stats['error']}"
+        print(error_msg)
+        raise gr.Error(error_msg)
+    # Format stats for display
+    stats_text = f"""**Processing Statistics:**
+- Pages processed: {stats.get('pages_processed', 0)}/{stats.get('total_pages', 0)}
+- Processing time: {stats.get('processing_time', 0)}s
+- DPI: {stats.get('dpi', 300)}
+- Language: {stats.get('language', 'en')}"""
+    return text, json_data, stats_text
+# =========================
+# CSS & UI
+# =========================
+# Stylesheet for the app; handed to gr.Blocks through its css= argument.
+custom_css = """
+/* Global fonts */
+body, .gradio-container {
+  font-family: "Inter", "Segoe UI", "Roboto", sans-serif;
+}
+.app-header {
+  text-align: center;
+  max-width: 900px;
+  margin: 0 auto 20px !important;
+  padding: 20px;
+  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+  border-radius: 12px;
+  color: white;
+}
+.app-header h1 {
+  margin: 0;
+  font-size: 2.5rem;
+  font-weight: 700;
+}
+.app-header p {
+  margin: 10px 0 0 0;
+  opacity: 0.9;
+  font-size: 1.1rem;
+}
+.gradio-container {
+  padding: 20px 0 !important;
+  max-width: 1200px;
+  margin: 0 auto;
+}
+.upload-section {
+  background: #f8fafc;
+  border: 2px dashed #cbd5e1;
+  border-radius: 12px;
+  padding: 30px;
+  text-align: center;
+  margin: 20px 0;
+}
+.upload-section:hover {
+  border-color: #667eea;
+  background: #f1f5f9;
+}
+.results-section {
+  margin-top: 20px;
+}
+.stats-box {
+  background: #f0f9ff;
+  border: 1px solid #0ea5e9;
+  border-radius: 8px;
+  padding: 15px;
+  margin: 10px 0;
+}
+#text_output {
+  min-height: 300px;
+  font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
+  font-size: 14px;
+  line-height: 1.6;
+}
+#json_output {
+  min-height: 200px;
+  font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
+  font-size: 12px;
+}
+.process-btn {
+  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+  color: white !important;
+  border: none !important;
+  padding: 12px 30px !important;
+  border-radius: 8px !important;
+  font-weight: 600 !important;
+  font-size: 16px !important;
+}
+.process-btn:hover {
+  transform: translateY(-2px);
+  box-shadow: 0 8px 25px rgba(102, 126, 234, 0.3);
+}
+.notice {
+  background: #fef3c7;
+  border: 1px solid #f59e0b;
+  border-radius: 8px;
+  padding: 15px;
+  margin: 20px 0;
+  color: #92400e;
+}
+.api-section {
+  background: #f1f5f9;
+  border-radius: 8px;
+  padding: 20px;
+  margin: 20px 0;
+  border-left: 4px solid #667eea;
+}
+"""
+# Page layout: header and tip banner, then an upload/action column next to a
+# tabbed results column, followed by the event wiring.
+with gr.Blocks(head=GOOGLE_FONTS_URL, css=custom_css, theme=gr.themes.Soft()) as demo:
+    # Header
+    gr.HTML("""
+    <div class="app-header">
+        <h1>📄 PDF OCR Extractor</h1>
+        <p>Extract text from PDF documents using PaddleOCR + PyMuPDF</p>
+    </div>
     """)
+    # Notice
+    gr.HTML("""
+    <div class="notice">
+        <strong>💡 Tip:</strong> This tool processes PDFs by rendering each page as a high-resolution image (300 DPI) and then applying OCR.
+        For best results, use clear, well-scanned PDFs with good contrast.
+    </div>
     """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Upload section. A real layout container carries the CSS class:
+            # separate gr.HTML('<div ...>') / gr.HTML('</div>') fragments are
+            # rendered as independent elements and cannot wrap the components
+            # declared between them.
+            with gr.Column(elem_classes=["upload-section"]):
+                pdf_input = gr.File(
+                    label="📁 Upload PDF File",
+                    file_types=[".pdf"],
+                    file_count="single",
+                    elem_id="pdf_upload"
+                )
+            # Process button
+            process_btn = gr.Button(
+                "🚀 Extract Text",
+                variant="primary",
+                elem_classes=["process-btn"],
+                scale=2
+            )
+            # API section
+            gr.HTML("""
+            <div class="api-section">
+                <h3>🔗 API Usage</h3>
+                <p><strong>Endpoint:</strong> <code>/predict</code></p>
+                <p><strong>Input:</strong> PDF file</p>
+                <p><strong>Output:</strong> Extracted text, per-page OCR results (JSON), and processing statistics</p>
+            </div>
+            """)
+        with gr.Column(scale=2):
+            # Results section (same container approach as the upload section)
+            with gr.Column(elem_classes=["results-section"]):
+                with gr.Tabs():
+                    with gr.Tab("📝 Extracted Text"):
+                        text_output = gr.Textbox(
+                            label="Extracted Text",
+                            lines=20,
+                            elem_id="text_output",
+                            placeholder="Extracted text will appear here..."
+                        )
+                    with gr.Tab("📊 JSON Data"):
+                        json_output = gr.Code(
+                            label="Detailed OCR Results (JSON)",
+                            language="json",
+                            elem_id="json_output",
+                            placeholder="Detailed OCR results will appear here..."
+                        )
+                    with gr.Tab("📈 Statistics"):
+                        stats_output = gr.Markdown(
+                            label="Processing Statistics",
+                            placeholder="Processing statistics will appear here..."
+                        )
+    # Event handlers
+    # The button click is the single listener published as the /predict endpoint.
+    process_btn.click(
+        fn=handle_pdf_ocr,
+        inputs=[pdf_input],
+        outputs=[text_output, json_output, stats_output],
+        api_name="predict"
+    )
+    # Auto-process on file upload. `.upload` is used instead of `.change` so
+    # that clearing the file does not call the handler with an empty value and
+    # surface a "Please upload a PDF file first." error. The listener is kept
+    # out of the API (api_name=False) so "predict" is registered only once.
+    pdf_input.upload(
+        fn=handle_pdf_ocr,
+        inputs=[pdf_input],
+        outputs=[text_output, json_output, stats_output],
+        api_name=False
+    )
 if __name__ == "__main__":
+    # Serve on all interfaces, on $PORT when it is set and 7860 otherwise,
+    # with the request queue capped at max_size=6.
+    listen_port = int(os.getenv("PORT", "7860"))
+    queued_app = demo.queue(max_size=6)
+    queued_app.launch(server_name="0.0.0.0", server_port=listen_port, share=False)