Spaces:

lucacadalora
/

jatevo

Running on Zero

App Files Community

lucacadalora committed 8 days ago

Commit

8aaa52d

verified ·

1 Parent(s): 604587a

Update app.py

Browse files

Files changed (1) hide show

app.py +158 -90

app.py CHANGED Viewed

@@ -94,23 +94,35 @@ def extract_images_from_pdf(pdf_path):
     return images_by_page
-def image_to_base64(pil_image, format='PNG'):
     """
     Convert PIL Image to base64 string for markdown embedding
     """
     buffered = BytesIO()
-    pil_image.save(buffered, format=format)
     img_str = base64.b64encode(buffered.getvalue()).decode()
     return f"data:image/{format.lower()};base64,{img_str}"
 def detect_figure_regions(text_result, original_image):
     """
-    Detect figure regions from OCR output and crop them
     Returns list of cropped figure images
     """
-    # Look for figure-related patterns in the text
-    # This is a heuristic approach - adjust based on your needs
     figure_images = []
     # Pattern to detect bounding boxes (if model returns them)
@@ -200,40 +212,40 @@ def process_single_page(image, model_runtime, tokenizer, model_size, task_type,
     # ===== Embed images if requested =====
     if embed_images and markdown_content:
         # Check if markdown mentions figures/charts/images
-        figure_keywords = ['figure', 'chart', 'graph', 'diagram', 'image', 'plot', 'illustration']
         has_figure_mention = any(keyword in markdown_content.lower() for keyword in figure_keywords)
         if has_figure_mention:
-            # Try to detect figure regions
             figure_images = detect_figure_regions(plain_text_result, image)
-            # If no figures detected by bounding boxes, embed the whole page as figure
-            if not figure_images and has_figure_mention:
-                # Embed full page image where figures are mentioned
-                base64_img = image_to_base64(image, format='JPEG')
-                figure_markdown = f"\n\n![Page {page_num} Visual Content]({base64_img})\n\n"
-                # Insert image after first mention of figure/chart
-                for keyword in figure_keywords:
-                    if keyword in markdown_content.lower():
-                        # Find the line with the keyword
-                        lines = markdown_content.split('\n')
-                        for i, line in enumerate(lines):
-                            if keyword in line.lower():
-                                # Insert image after this line
-                                lines.insert(i + 1, figure_markdown)
-                                markdown_content = '\n'.join(lines)
-                                break
-                        break
-            # If we found specific figure regions, embed them
-            elif figure_images:
                 for idx, fig_img in enumerate(figure_images):
-                    base64_img = image_to_base64(fig_img, format='PNG')
-                    fig_markdown = f"\n\n![Figure {idx+1} from Page {page_num}]({base64_img})\n\n"
-                    markdown_content += fig_markdown
-    return markdown_content
 # ===== Main Processing Function =====
@@ -241,9 +253,10 @@ def process_single_page(image, model_runtime, tokenizer, model_size, task_type,
 def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_images, progress=gr.Progress()):
     """
     Process PDF with DeepSeek-OCR and return combined markdown from all pages.
     """
     if pdf_file is None:
-        return "Please upload a PDF file first."
     # handle CPU/GPU
     if torch.cuda.is_available():
@@ -258,13 +271,17 @@ def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_i
         total_pages = len(images)
         if total_pages == 0:
-            return "No pages found in the PDF."
         # Extract embedded images if needed
         embedded_images = {}
         if embed_images:
-            progress(0.05, desc="Extracting embedded images...")
-            embedded_images = extract_images_from_pdf(pdf_file.name)
         progress(0.1, desc=f"Found {total_pages} pages. Starting OCR...")
@@ -279,7 +296,7 @@ def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_i
                         desc=f"Processing page {page_num}/{total_pages}..."
                     )
-                    markdown_content = process_single_page(
                         image,
                         model_runtime,
                         tokenizer,
@@ -294,12 +311,15 @@ def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_i
                     # Add embedded images from PDF if any
                     if embed_images and (page_num - 1) in embedded_images:
-                        markdown_content += "\n\n### Embedded Images from this Page\n\n"
                         for idx, img in enumerate(embedded_images[page_num - 1]):
-                            base64_img = image_to_base64(img, format='PNG')
-                            markdown_content += f"![Embedded Image {idx+1}]({base64_img})\n\n"
-                    # Add page separator
                     page_header = f"\n\n---\n\n# Page {page_num}\n\n"
                     all_markdown_results.append(page_header + markdown_content)
@@ -309,24 +329,61 @@ def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_i
                     gc.collect()
                 except Exception as e:
-                    error_msg = f"\n\n---\n\n# Page {page_num}\n\n**Error processing this page:** {str(e)}\n\n"
                     all_markdown_results.append(error_msg)
                     print(f"Error on page {page_num}: {str(e)}")
                     continue
         # Combine all results
-        progress(1.0, desc="Finalizing...")
         combined_markdown = "\n\n".join(all_markdown_results)
-        # Add document header
-        final_output = f"# Document OCR Results\n\n**Total Pages:** {total_pages}\n\n{combined_markdown}"
-        return final_output
     except Exception as e:
-        error_message = f"Error processing PDF: {str(e)}\n\nPlease try:\n- Using a smaller model size\n- Processing fewer pages\n- Checking if the PDF is corrupted"
         print(f"Fatal error: {str(e)}")
-        return error_message
 # ===== Theme and UI =====
@@ -334,6 +391,7 @@ theme = Soft(
     font=fonts.GoogleFont("Inter"),
     font_mono=fonts.GoogleFont("JetBrains Mono"),
 )
 custom_css = """
 .gradio-container, body {
   font-family: 'Inter', ui-sans-serif, system-ui, -apple-system, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, 'Noto Sans', 'Apple Color Emoji','Segoe UI Emoji','Segoe UI Symbol','Noto Color Emoji' !important;
@@ -341,6 +399,7 @@ custom_css = """
 .prose h1 { font-weight: 800; letter-spacing: -0.02em; }
 .prose h2, .prose h3 { font-weight: 700; letter-spacing: -0.01em; }
 .gr-button { border-radius: 12px; font-weight: 600; }
 """
@@ -357,27 +416,36 @@ with gr.Blocks(
         Upload a PDF to extract text and convert to Markdown using **DeepSeek-OCR**.
         Each page is processed sequentially and combined into a single markdown document.
-        **NEW:** Now supports embedding images/charts directly in markdown output!
-        **Model Sizes:**
-        - **Tiny** — Fastest, lower accuracy (512×512) - Best for large PDFs
-        - **Small** — Fast, good accuracy (640×640) - Good for 20+ pages
         - **Base** — Balanced performance (1024×1024) - Good for 10-20 pages
         - **Large** — Best accuracy, slower (1280×1280) - Best for <10 pages
         - **Gundam (Recommended)** — Optimized for documents (1024 base, 640 image, crop mode)
-        **Tips for large PDFs:**
-        - Use Tiny or Small model for 20+ pages
-        - Enable "Embed Images" to include charts/figures in markdown
-        - Processing time: ~2-5 seconds per page depending on model size
         - Maximum recommended: 50 pages at once
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
             pdf_input = gr.File(
-                label="Upload PDF",
                 file_types=[".pdf"],
                 type="filepath"
             )
@@ -385,7 +453,7 @@ with gr.Blocks(
             model_size = gr.Dropdown(
                 choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                 value="Small",
-                label="Model Size",
                 info="Use Tiny/Small for large PDFs (20+ pages)"
             )
@@ -397,58 +465,65 @@ with gr.Blocks(
                     "🔍 Locate Object by Reference",
                 ],
                 value="📄 Convert to Markdown",
-                label="Task Type",
             )
             ref_text_input = gr.Textbox(
-                label="Reference Text (for Locate task)",
                 placeholder="e.g., 'the teacher', '20-10', 'a red car'...",
                 visible=False,
             )
-            eval_mode_checkbox = gr.Checkbox(
-                value=False,
-                label="Enable Evaluation Mode",
-                info="Returns only plain text (faster).",
-            )
-            embed_images_checkbox = gr.Checkbox(
-                value=True,
-                label="🖼️ Embed Images/Charts in Markdown",
-                info="Include images and charts as base64 in the markdown output"
-            )
             submit_btn = gr.Button("🚀 Process PDF", variant="primary", size="lg")
             gr.Markdown(
                 """
                 ---
-                **Processing Status:**
-                Watch the progress bar above for real-time updates.
-                **Note:** Embedding images will increase output size and processing time.
                 """
             )
         with gr.Column(scale=2):
-            gr.Markdown("### 📝 Markdown Output")
             output_markdown_preview = gr.Markdown(
                 label="Rendered Markdown",
-                value="*Upload a PDF and click 'Process PDF' to see results here.*"
             )
-            gr.Markdown("### 📄 Markdown Source (Copy/Download)")
             output_text = gr.Textbox(
                 label="Raw Markdown",
                 lines=25,
                 show_copy_button=True,
                 interactive=False,
-                placeholder="Markdown source will appear here..."
             )
     # show/hide reference text box based on selected task
     def toggle_ref_text_visibility(task):
-        return gr.Textbox(visible=True) if task == "🔍 Locate Object by Reference" else gr.Textbox(visible=False)
     task_type.change(
         fn=toggle_ref_text_visibility,
@@ -456,18 +531,10 @@ with gr.Blocks(
         outputs=ref_text_input,
     )
-    def update_outputs(markdown_text):
-        """Update both markdown preview and raw text"""
-        return markdown_text, markdown_text
     submit_btn.click(
         fn=process_pdf,
         inputs=[pdf_input, model_size, task_type, ref_text_input, eval_mode_checkbox, embed_images_checkbox],
-        outputs=output_text,
-    ).then(
-        fn=update_outputs,
-        inputs=output_text,
-        outputs=[output_markdown_preview, output_text]
     )
@@ -479,6 +546,7 @@ if __name__ == "__main__":
         default_concurrency_limit=2
     )
     demo.launch(
-        max_threads=40,  # Increase thread limit
-        show_error=True  # Show errors in UI
     )

     return images_by_page
+def image_to_base64(pil_image, format='JPEG', max_size=(1200, 1200)):
     """
     Convert PIL Image to base64 string for markdown embedding
+    Resize if too large to keep file size manageable
     """
+    # Resize if image is too large
+    if pil_image.size[0] > max_size[0] or pil_image.size[1] > max_size[1]:
+        pil_image = pil_image.copy()
+        pil_image.thumbnail(max_size, Image.Resampling.LANCZOS)
     buffered = BytesIO()
+    # Convert RGBA to RGB if necessary
+    if pil_image.mode == 'RGBA' and format == 'JPEG':
+        rgb_image = Image.new('RGB', pil_image.size, (255, 255, 255))
+        rgb_image.paste(pil_image, mask=pil_image.split()[3])
+        rgb_image.save(buffered, format=format, quality=85)
+    else:
+        pil_image.save(buffered, format=format, quality=85 if format == 'JPEG' else None)
     img_str = base64.b64encode(buffered.getvalue()).decode()
     return f"data:image/{format.lower()};base64,{img_str}"
 def detect_figure_regions(text_result, original_image):
     """
+    Detect figure regions from OCR output using bounding boxes
     Returns list of cropped figure images
     """
     figure_images = []
     # Pattern to detect bounding boxes (if model returns them)
     # ===== Embed images if requested =====
     if embed_images and markdown_content:
         # Check if markdown mentions figures/charts/images
+        figure_keywords = ['figure', 'chart', 'graph', 'diagram', 'image', 'plot', 'illustration', 'table', 'screenshot']
         has_figure_mention = any(keyword in markdown_content.lower() for keyword in figure_keywords)
         if has_figure_mention:
+            # Try to detect figure regions from bounding boxes
             figure_images = detect_figure_regions(plain_text_result, image)
+            # If specific figures detected, embed them
+            if figure_images:
+                figures_markdown = "\n\n### Detected Figures\n\n"
                 for idx, fig_img in enumerate(figure_images):
+                    try:
+                        base64_img = image_to_base64(fig_img, format='PNG')
+                        figures_markdown += f"![Figure {idx+1} from Page {page_num}]({base64_img})\n\n"
+                    except Exception as e:
+                        print(f"Error embedding figure {idx+1}: {e}")
+                markdown_content += figures_markdown
+            else:
+                # No specific regions detected, but figures mentioned
+                # Embed full page image for context
+                try:
+                    base64_img = image_to_base64(image, format='JPEG')
+                    page_image_markdown = f"\n\n### Page {page_num} Visual Content\n\n![Page {page_num} Full View]({base64_img})\n\n"
+                    # Insert image after first paragraph or at the beginning
+                    lines = markdown_content.split('\n\n', 1)
+                    if len(lines) > 1:
+                        markdown_content = lines[0] + page_image_markdown + lines[1]
+                    else:
+                        markdown_content = page_image_markdown + markdown_content
+                except Exception as e:
+                    print(f"Error embedding page image: {e}")
+    return markdown_content, plain_text_result
 # ===== Main Processing Function =====
 def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_images, progress=gr.Progress()):
     """
     Process PDF with DeepSeek-OCR and return combined markdown from all pages.
+    Includes both visual images and extracted text content.
     """
     if pdf_file is None:
+        return "Please upload a PDF file first.", "Please upload a PDF file first."
     # handle CPU/GPU
     if torch.cuda.is_available():
         total_pages = len(images)
         if total_pages == 0:
+            return "No pages found in the PDF.", "No pages found in the PDF."
         # Extract embedded images if needed
         embedded_images = {}
         if embed_images:
+            progress(0.05, desc="Extracting embedded images from PDF...")
+            try:
+                embedded_images = extract_images_from_pdf(pdf_file.name)
+                print(f"Found embedded images on {len(embedded_images)} pages")
+            except Exception as e:
+                print(f"Could not extract embedded images: {e}")
         progress(0.1, desc=f"Found {total_pages} pages. Starting OCR...")
                         desc=f"Processing page {page_num}/{total_pages}..."
                     )
+                    markdown_content, plain_text = process_single_page(
                         image,
                         model_runtime,
                         tokenizer,
                     # Add embedded images from PDF if any
                     if embed_images and (page_num - 1) in embedded_images:
+                        markdown_content += "\n\n### Embedded Images from PDF\n\n"
                         for idx, img in enumerate(embedded_images[page_num - 1]):
+                            try:
+                                base64_img = image_to_base64(img, format='PNG')
+                                markdown_content += f"![Embedded Image {idx+1} - Page {page_num}]({base64_img})\n\n"
+                            except Exception as e:
+                                print(f"Error embedding PDF image {idx+1}: {e}")
+                    # Add page separator and content
                     page_header = f"\n\n---\n\n# Page {page_num}\n\n"
                     all_markdown_results.append(page_header + markdown_content)
                     gc.collect()
                 except Exception as e:
+                    error_msg = f"\n\n---\n\n# Page {page_num}\n\n**⚠️ Error processing this page:** {str(e)}\n\n"
                     all_markdown_results.append(error_msg)
                     print(f"Error on page {page_num}: {str(e)}")
+                    # Clear memory even on error
+                    if torch.cuda.is_available():
+                        torch.cuda.empty_cache()
+                    gc.collect()
                     continue
         # Combine all results
+        progress(1.0, desc="Finalizing document...")
         combined_markdown = "\n\n".join(all_markdown_results)
+        # Add document header with metadata
+        image_status = "✅ Enabled" if embed_images else "❌ Disabled"
+        final_output = f"""# 📄 Document OCR Results
+**Total Pages:** {total_pages}
+**Model Size:** {model_size}
+**Task Type:** {task_type}
+**Image Embedding:** {image_status}
+---
+{combined_markdown}
+---
+**End of Document** - Processed {total_pages} pages successfully.
+"""
+        return final_output, final_output  # Return twice: once for preview, once for raw text
     except Exception as e:
+        error_message = f"""# ❌ Error Processing PDF
+**Error:** {str(e)}
+**Troubleshooting Tips:**
+- Try using a smaller model size (Tiny or Small)
+- Disable image embedding for faster processing
+- Check if the PDF is corrupted or password-protected
+- For very large PDFs (50+ pages), consider processing in batches
+- Ensure you have enough GPU memory available
+**Technical Details:**
+```
+{str(e)}
+```
+"""
         print(f"Fatal error: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return error_message, error_message  # Return twice: once for preview, once for raw text
 # ===== Theme and UI =====
     font=fonts.GoogleFont("Inter"),
     font_mono=fonts.GoogleFont("JetBrains Mono"),
 )
 custom_css = """
 .gradio-container, body {
   font-family: 'Inter', ui-sans-serif, system-ui, -apple-system, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, 'Noto Sans', 'Apple Color Emoji','Segoe UI Emoji','Segoe UI Symbol','Noto Color Emoji' !important;
 .prose h1 { font-weight: 800; letter-spacing: -0.02em; }
 .prose h2, .prose h3 { font-weight: 700; letter-spacing: -0.01em; }
 .gr-button { border-radius: 12px; font-weight: 600; }
+.prose img { max-width: 100%; height: auto; border-radius: 8px; margin: 1rem 0; }
 """
         Upload a PDF to extract text and convert to Markdown using **DeepSeek-OCR**.
         Each page is processed sequentially and combined into a single markdown document.
+        ## ✨ Features
+        - 🖼️ **Image Embedding** - Charts, graphs, and figures embedded directly in markdown
+        - 📝 **Text Extraction** - All text content from images and charts extracted
+        - 📊 **Table Support** - Tables converted to markdown format
+        - 🔍 **Object Detection** - Locate specific elements in documents
+        - 🎯 **Multiple Models** - Choose speed vs. accuracy trade-off
+        ## 📏 Model Sizes
+        - **Tiny** — Fastest, lower accuracy (512×512) - Best for large PDFs (30+ pages)
+        - **Small** — Fast, good accuracy (640×640) - Good for 15-30 pages
         - **Base** — Balanced performance (1024×1024) - Good for 10-20 pages
         - **Large** — Best accuracy, slower (1280×1280) - Best for <10 pages
         - **Gundam (Recommended)** — Optimized for documents (1024 base, 640 image, crop mode)
+        ## 💡 Tips
+        - Enable **"Embed Images"** to include charts/figures (recommended)
+        - Use **Tiny or Small** model for large PDFs (20+ pages)
+        - Processing time: ~2-5 seconds per page depending on model
         - Maximum recommended: 50 pages at once
+        - Image embedding increases file size (~1-2MB per page with images)
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
             pdf_input = gr.File(
+                label="📎 Upload PDF",
                 file_types=[".pdf"],
                 type="filepath"
             )
             model_size = gr.Dropdown(
                 choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                 value="Small",
+                label="🎯 Model Size",
                 info="Use Tiny/Small for large PDFs (20+ pages)"
             )
                     "🔍 Locate Object by Reference",
                 ],
                 value="📄 Convert to Markdown",
+                label="📋 Task Type",
             )
             ref_text_input = gr.Textbox(
+                label="🔍 Reference Text (for Locate task)",
                 placeholder="e.g., 'the teacher', '20-10', 'a red car'...",
                 visible=False,
             )
+            with gr.Row():
+                eval_mode_checkbox = gr.Checkbox(
+                    value=False,
+                    label="⚡ Evaluation Mode",
+                    info="Plain text only (faster)",
+                )
+                embed_images_checkbox = gr.Checkbox(
+                    value=True,
+                    label="🖼️ Embed Images",
+                    info="Include charts/figures in output",
+                )
             submit_btn = gr.Button("🚀 Process PDF", variant="primary", size="lg")
             gr.Markdown(
                 """
                 ---
+                ### 📊 Processing Status
+                Watch the progress bar for real-time updates.
+                **Note:** Image embedding provides both:
+                - 👁️ Visual image (embedded as base64)
+                - 📝 Extracted text content (OCR'd from image)
+                You get the best of both worlds!
                 """
             )
         with gr.Column(scale=2):
+            gr.Markdown("### 📝 Markdown Output Preview")
             output_markdown_preview = gr.Markdown(
                 label="Rendered Markdown",
+                value="*Upload a PDF and click 'Process PDF' to see results here.*\n\n*The output will include both images and extracted text.*"
             )
+            gr.Markdown("### 📄 Raw Markdown Source (Copy/Download)")
             output_text = gr.Textbox(
                 label="Raw Markdown",
                 lines=25,
                 show_copy_button=True,
                 interactive=False,
+                placeholder="Markdown source will appear here... You can copy/paste this into any markdown editor."
             )
     # show/hide reference text box based on selected task
     def toggle_ref_text_visibility(task):
+        return gr.Textbox(visible=(task == "🔍 Locate Object by Reference"))
     task_type.change(
         fn=toggle_ref_text_visibility,
         outputs=ref_text_input,
     )
     submit_btn.click(
         fn=process_pdf,
         inputs=[pdf_input, model_size, task_type, ref_text_input, eval_mode_checkbox, embed_images_checkbox],
+        outputs=[output_markdown_preview, output_text],
     )
         default_concurrency_limit=2
     )
     demo.launch(
+        max_threads=40,  # Increase thread limit for better concurrency
+        show_error=True,  # Show errors in UI for debugging
+        share=False  # Set to True to create a public link
     )