Spaces:

lucacadalora
/

jatevo

Running on Zero

App Files Community

lucacadalora committed 7 days ago

Commit

3959304

verified ·

1 Parent(s): 8aaa52d

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -36

app.py CHANGED Viewed

@@ -149,7 +149,7 @@ def detect_figure_regions(text_result, original_image):
 def process_single_page(image, model_runtime, tokenizer, model_size, task_type, ref_text, is_eval_mode, output_path, page_num, embed_images=True):
     """
     Process a single page/image with DeepSeek-OCR
-    Returns markdown content with embedded images if requested
     """
     # ===== choose task prompt =====
     if task_type == "📝 Free OCR":
@@ -210,42 +210,80 @@ def process_single_page(image, model_runtime, tokenizer, model_size, task_type,
         markdown_content = plain_text_result
     # ===== Embed images if requested =====
     if embed_images and markdown_content:
-        # Check if markdown mentions figures/charts/images
-        figure_keywords = ['figure', 'chart', 'graph', 'diagram', 'image', 'plot', 'illustration', 'table', 'screenshot']
-        has_figure_mention = any(keyword in markdown_content.lower() for keyword in figure_keywords)
-        if has_figure_mention:
-            # Try to detect figure regions from bounding boxes
-            figure_images = detect_figure_regions(plain_text_result, image)
-            # If specific figures detected, embed them
-            if figure_images:
-                figures_markdown = "\n\n### Detected Figures\n\n"
-                for idx, fig_img in enumerate(figure_images):
-                    try:
-                        base64_img = image_to_base64(fig_img, format='PNG')
-                        figures_markdown += f"![Figure {idx+1} from Page {page_num}]({base64_img})\n\n"
-                    except Exception as e:
-                        print(f"Error embedding figure {idx+1}: {e}")
-                markdown_content += figures_markdown
-            else:
-                # No specific regions detected, but figures mentioned
-                # Embed full page image for context
-                try:
-                    base64_img = image_to_base64(image, format='JPEG')
-                    page_image_markdown = f"\n\n### Page {page_num} Visual Content\n\n![Page {page_num} Full View]({base64_img})\n\n"
-                    # Insert image after first paragraph or at the beginning
-                    lines = markdown_content.split('\n\n', 1)
-                    if len(lines) > 1:
-                        markdown_content = lines[0] + page_image_markdown + lines[1]
-                    else:
-                        markdown_content = page_image_markdown + markdown_content
                 except Exception as e:
-                    print(f"Error embedding page image: {e}")
-    return markdown_content, plain_text_result
 # ===== Main Processing Function =====
@@ -296,7 +334,7 @@ def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_i
                         desc=f"Processing page {page_num}/{total_pages}..."
                     )
-                    markdown_content, plain_text = process_single_page(
                         image,
                         model_runtime,
                         tokenizer,
@@ -309,15 +347,20 @@ def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_i
                         embed_images
                     )
-                    # Add embedded images from PDF if any
                     if embed_images and (page_num - 1) in embedded_images:
-                        markdown_content += "\n\n### Embedded Images from PDF\n\n"
                         for idx, img in enumerate(embedded_images[page_num - 1]):
                             try:
                                 base64_img = image_to_base64(img, format='PNG')
-                                markdown_content += f"![Embedded Image {idx+1} - Page {page_num}]({base64_img})\n\n"
                             except Exception as e:
                                 print(f"Error embedding PDF image {idx+1}: {e}")
                     # Add page separator and content
                     page_header = f"\n\n---\n\n# Page {page_num}\n\n"

 def process_single_page(image, model_runtime, tokenizer, model_size, task_type, ref_text, is_eval_mode, output_path, page_num, embed_images=True):
     """
     Process a single page/image with DeepSeek-OCR
+    Returns markdown content with embedded images inline with context
     """
     # ===== choose task prompt =====
     if task_type == "📝 Free OCR":
         markdown_content = plain_text_result
     # ===== Embed images if requested =====
+    embedded_images_list = []
     if embed_images and markdown_content:
+        # Extract embedded PDF images first (logos, seals, etc.)
+        figure_images = detect_figure_regions(plain_text_result, image)
+        # Detect document type for smart placement
+        is_certificate = any(word in markdown_content.lower() for word in ['sertifikat', 'certificate', 'pengesahan', 'approval'])
+        is_letter = any(word in markdown_content.lower() for word in ['surat', 'letter', 'memo', 'kementerian', 'ministry'])
+        has_logo = 'logo' in markdown_content.lower() or 'seal' in markdown_content.lower()
+        # For certificates and official letters, embed full page at top
+        if (is_certificate or is_letter or has_logo) and len(markdown_content.split()) > 20:
+            try:
+                base64_img = image_to_base64(image, format='JPEG')
+                # Find the first heading or title
+                lines = markdown_content.split('\n')
+                insert_pos = 0
+                # Look for the first significant heading (##, ###, or all-caps line)
+                for i, line in enumerate(lines):
+                    stripped = line.strip()
+                    if stripped.startswith('##') or (stripped.isupper() and len(stripped.split()) >= 3):
+                        insert_pos = i + 1
+                        break
+                # Insert image right after the title
+                if insert_pos > 0:
+                    lines.insert(insert_pos, f"\n![Page {page_num} - Official Document]({base64_img})\n")
+                    markdown_content = '\n'.join(lines)
+                else:
+                    # No clear heading, insert at top
+                    markdown_content = f"![Page {page_num}]({base64_img})\n\n" + markdown_content
+            except Exception as e:
+                print(f"Error embedding page image: {e}")
+        # If specific figures detected (charts, graphs), embed them inline
+        elif figure_images:
+            figures_markdown = "\n\n"
+            for idx, fig_img in enumerate(figure_images):
+                try:
+                    base64_img = image_to_base64(fig_img, format='PNG')
+                    figures_markdown += f"![Figure {idx+1}]({base64_img})\n\n"
                 except Exception as e:
+                    print(f"Error embedding figure {idx+1}: {e}")
+            # Insert after the first paragraph
+            paragraphs = markdown_content.split('\n\n', 1)
+            if len(paragraphs) >= 2:
+                markdown_content = paragraphs[0] + figures_markdown + paragraphs[1]
+            else:
+                markdown_content = figures_markdown + markdown_content
+        # For pages with charts/graphs mentioned but not detected
+        elif any(word in markdown_content.lower() for word in ['chart', 'graph', 'diagram', 'figure']):
+            try:
+                base64_img = image_to_base64(image, format='JPEG')
+                # Insert after first mention of chart/graph
+                for keyword in ['chart', 'graph', 'diagram', 'figure']:
+                    if keyword in markdown_content.lower():
+                        parts = markdown_content.lower().split(keyword, 1)
+                        # Find the position in the original text
+                        pos = len(parts[0])
+                        # Insert image after the current paragraph
+                        para_end = markdown_content.find('\n\n', pos)
+                        if para_end > 0:
+                            markdown_content = markdown_content[:para_end] + f"\n\n![Visual Content]({base64_img})\n\n" + markdown_content[para_end+2:]
+                        else:
+                            markdown_content += f"\n\n![Visual Content]({base64_img})\n\n"
+                        break
+            except Exception as e:
+                print(f"Error embedding contextual image: {e}")
+    return markdown_content, plain_text_result, embedded_images_list
 # ===== Main Processing Function =====
                         desc=f"Processing page {page_num}/{total_pages}..."
                     )
+                    markdown_content, plain_text, page_embedded_imgs = process_single_page(
                         image,
                         model_runtime,
                         tokenizer,
                         embed_images
                     )
+                    # Add embedded images from PDF inline if any
                     if embed_images and (page_num - 1) in embedded_images:
+                        # Insert PDF images right after the OCR'd content, but before page separator
+                        pdf_images_markdown = "\n\n"
                         for idx, img in enumerate(embedded_images[page_num - 1]):
                             try:
                                 base64_img = image_to_base64(img, format='PNG')
+                                # Add inline without a big header
+                                pdf_images_markdown += f"![Image {idx+1}]({base64_img})\n\n"
                             except Exception as e:
                                 print(f"Error embedding PDF image {idx+1}: {e}")
+                        # Append to the markdown content directly (inline)
+                        markdown_content += pdf_images_markdown
                     # Add page separator and content
                     page_header = f"\n\n---\n\n# Page {page_num}\n\n"