Spaces:
				
			
			
	
			
			
		Running
		
			on 
			
			Zero
	
	
	
			
			
	
	
	
	
		
		
		Running
		
			on 
			
			Zero
	Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -149,7 +149,7 @@ def detect_figure_regions(text_result, original_image): | |
| 149 | 
             
            def process_single_page(image, model_runtime, tokenizer, model_size, task_type, ref_text, is_eval_mode, output_path, page_num, embed_images=True):
         | 
| 150 | 
             
                """
         | 
| 151 | 
             
                Process a single page/image with DeepSeek-OCR
         | 
| 152 | 
            -
                Returns markdown content with embedded images  | 
| 153 | 
             
                """
         | 
| 154 | 
             
                # ===== choose task prompt =====
         | 
| 155 | 
             
                if task_type == "📝 Free OCR":
         | 
| @@ -210,42 +210,80 @@ def process_single_page(image, model_runtime, tokenizer, model_size, task_type, | |
| 210 | 
             
                    markdown_content = plain_text_result
         | 
| 211 |  | 
| 212 | 
             
                # ===== Embed images if requested =====
         | 
|  | |
| 213 | 
             
                if embed_images and markdown_content:
         | 
| 214 | 
            -
                    #  | 
| 215 | 
            -
                     | 
| 216 | 
            -
                    has_figure_mention = any(keyword in markdown_content.lower() for keyword in figure_keywords)
         | 
| 217 |  | 
| 218 | 
            -
                     | 
| 219 | 
            -
             | 
| 220 | 
            -
             | 
| 221 | 
            -
             | 
| 222 | 
            -
             | 
| 223 | 
            -
             | 
| 224 | 
            -
             | 
| 225 | 
            -
             | 
| 226 | 
            -
             | 
| 227 | 
            -
             | 
| 228 | 
            -
             | 
| 229 | 
            -
             | 
| 230 | 
            -
             | 
| 231 | 
            -
                             | 
| 232 | 
            -
             | 
| 233 | 
            -
             | 
| 234 | 
            -
             | 
| 235 | 
            -
             | 
| 236 | 
            -
             | 
| 237 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 238 |  | 
| 239 | 
            -
             | 
| 240 | 
            -
             | 
| 241 | 
            -
             | 
| 242 | 
            -
             | 
| 243 | 
            -
             | 
| 244 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
| 245 | 
             
                            except Exception as e:
         | 
| 246 | 
            -
                                print(f"Error embedding  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 247 |  | 
| 248 | 
            -
                return markdown_content, plain_text_result
         | 
| 249 |  | 
| 250 |  | 
| 251 | 
             
            # ===== Main Processing Function =====
         | 
| @@ -296,7 +334,7 @@ def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_i | |
| 296 | 
             
                                    desc=f"Processing page {page_num}/{total_pages}..."
         | 
| 297 | 
             
                                )
         | 
| 298 |  | 
| 299 | 
            -
                                markdown_content, plain_text = process_single_page(
         | 
| 300 | 
             
                                    image,
         | 
| 301 | 
             
                                    model_runtime,
         | 
| 302 | 
             
                                    tokenizer,
         | 
| @@ -309,15 +347,20 @@ def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_i | |
| 309 | 
             
                                    embed_images
         | 
| 310 | 
             
                                )
         | 
| 311 |  | 
| 312 | 
            -
                                # Add embedded images from PDF if any
         | 
| 313 | 
             
                                if embed_images and (page_num - 1) in embedded_images:
         | 
| 314 | 
            -
                                     | 
|  | |
| 315 | 
             
                                    for idx, img in enumerate(embedded_images[page_num - 1]):
         | 
| 316 | 
             
                                        try:
         | 
| 317 | 
             
                                            base64_img = image_to_base64(img, format='PNG')
         | 
| 318 | 
            -
                                             | 
|  | |
| 319 | 
             
                                        except Exception as e:
         | 
| 320 | 
             
                                            print(f"Error embedding PDF image {idx+1}: {e}")
         | 
|  | |
|  | |
|  | |
| 321 |  | 
| 322 | 
             
                                # Add page separator and content
         | 
| 323 | 
             
                                page_header = f"\n\n---\n\n# Page {page_num}\n\n"
         | 
|  | |
| 149 | 
             
            def process_single_page(image, model_runtime, tokenizer, model_size, task_type, ref_text, is_eval_mode, output_path, page_num, embed_images=True):
         | 
| 150 | 
             
                """
         | 
| 151 | 
             
                Process a single page/image with DeepSeek-OCR
         | 
| 152 | 
            +
                Returns markdown content with embedded images inline with context
         | 
| 153 | 
             
                """
         | 
| 154 | 
             
                # ===== choose task prompt =====
         | 
| 155 | 
             
                if task_type == "📝 Free OCR":
         | 
|  | |
| 210 | 
             
                    markdown_content = plain_text_result
         | 
| 211 |  | 
| 212 | 
             
                # ===== Embed images if requested =====
         | 
| 213 | 
            +
                embedded_images_list = []
         | 
| 214 | 
             
                if embed_images and markdown_content:
         | 
| 215 | 
            +
                    # Extract embedded PDF images first (logos, seals, etc.)
         | 
| 216 | 
            +
                    figure_images = detect_figure_regions(plain_text_result, image)
         | 
|  | |
| 217 |  | 
| 218 | 
            +
                    # Detect document type for smart placement
         | 
| 219 | 
            +
                    is_certificate = any(word in markdown_content.lower() for word in ['sertifikat', 'certificate', 'pengesahan', 'approval'])
         | 
| 220 | 
            +
                    is_letter = any(word in markdown_content.lower() for word in ['surat', 'letter', 'memo', 'kementerian', 'ministry'])
         | 
| 221 | 
            +
                    has_logo = 'logo' in markdown_content.lower() or 'seal' in markdown_content.lower()
         | 
| 222 | 
            +
                    
         | 
| 223 | 
            +
                    # For certificates and official letters, embed full page at top
         | 
| 224 | 
            +
                    if (is_certificate or is_letter or has_logo) and len(markdown_content.split()) > 20:
         | 
| 225 | 
            +
                        try:
         | 
| 226 | 
            +
                            base64_img = image_to_base64(image, format='JPEG')
         | 
| 227 | 
            +
                            # Find the first heading or title
         | 
| 228 | 
            +
                            lines = markdown_content.split('\n')
         | 
| 229 | 
            +
                            insert_pos = 0
         | 
| 230 | 
            +
                            
         | 
| 231 | 
            +
                            # Look for the first significant heading (##, ###, or all-caps line)
         | 
| 232 | 
            +
                            for i, line in enumerate(lines):
         | 
| 233 | 
            +
                                stripped = line.strip()
         | 
| 234 | 
            +
                                if stripped.startswith('##') or (stripped.isupper() and len(stripped.split()) >= 3):
         | 
| 235 | 
            +
                                    insert_pos = i + 1
         | 
| 236 | 
            +
                                    break
         | 
| 237 | 
            +
                            
         | 
| 238 | 
            +
                            # Insert image right after the title
         | 
| 239 | 
            +
                            if insert_pos > 0:
         | 
| 240 | 
            +
                                lines.insert(insert_pos, f"\n\n")
         | 
| 241 | 
            +
                                markdown_content = '\n'.join(lines)
         | 
| 242 | 
            +
                            else:
         | 
| 243 | 
            +
                                # No clear heading, insert at top
         | 
| 244 | 
            +
                                markdown_content = f"\n\n" + markdown_content
         | 
| 245 |  | 
| 246 | 
            +
                        except Exception as e:
         | 
| 247 | 
            +
                            print(f"Error embedding page image: {e}")
         | 
| 248 | 
            +
                    
         | 
| 249 | 
            +
                    # If specific figures detected (charts, graphs), embed them inline
         | 
| 250 | 
            +
                    elif figure_images:
         | 
| 251 | 
            +
                        figures_markdown = "\n\n"
         | 
| 252 | 
            +
                        for idx, fig_img in enumerate(figure_images):
         | 
| 253 | 
            +
                            try:
         | 
| 254 | 
            +
                                base64_img = image_to_base64(fig_img, format='PNG')
         | 
| 255 | 
            +
                                figures_markdown += f"\n\n"
         | 
| 256 | 
             
                            except Exception as e:
         | 
| 257 | 
            +
                                print(f"Error embedding figure {idx+1}: {e}")
         | 
| 258 | 
            +
                        
         | 
| 259 | 
            +
                        # Insert after the first paragraph
         | 
| 260 | 
            +
                        paragraphs = markdown_content.split('\n\n', 1)
         | 
| 261 | 
            +
                        if len(paragraphs) >= 2:
         | 
| 262 | 
            +
                            markdown_content = paragraphs[0] + figures_markdown + paragraphs[1]
         | 
| 263 | 
            +
                        else:
         | 
| 264 | 
            +
                            markdown_content = figures_markdown + markdown_content
         | 
| 265 | 
            +
                    
         | 
| 266 | 
            +
                    # For pages with charts/graphs mentioned but not detected
         | 
| 267 | 
            +
                    elif any(word in markdown_content.lower() for word in ['chart', 'graph', 'diagram', 'figure']):
         | 
| 268 | 
            +
                        try:
         | 
| 269 | 
            +
                            base64_img = image_to_base64(image, format='JPEG')
         | 
| 270 | 
            +
                            # Insert after first mention of chart/graph
         | 
| 271 | 
            +
                            for keyword in ['chart', 'graph', 'diagram', 'figure']:
         | 
| 272 | 
            +
                                if keyword in markdown_content.lower():
         | 
| 273 | 
            +
                                    parts = markdown_content.lower().split(keyword, 1)
         | 
| 274 | 
            +
                                    # Find the position in the original text
         | 
| 275 | 
            +
                                    pos = len(parts[0])
         | 
| 276 | 
            +
                                    # Insert image after the current paragraph
         | 
| 277 | 
            +
                                    para_end = markdown_content.find('\n\n', pos)
         | 
| 278 | 
            +
                                    if para_end > 0:
         | 
| 279 | 
            +
                                        markdown_content = markdown_content[:para_end] + f"\n\n\n\n" + markdown_content[para_end+2:]
         | 
| 280 | 
            +
                                    else:
         | 
| 281 | 
            +
                                        markdown_content += f"\n\n\n\n"
         | 
| 282 | 
            +
                                    break
         | 
| 283 | 
            +
                        except Exception as e:
         | 
| 284 | 
            +
                            print(f"Error embedding contextual image: {e}")
         | 
| 285 |  | 
| 286 | 
            +
                return markdown_content, plain_text_result, embedded_images_list
         | 
| 287 |  | 
| 288 |  | 
| 289 | 
             
            # ===== Main Processing Function =====
         | 
|  | |
| 334 | 
             
                                    desc=f"Processing page {page_num}/{total_pages}..."
         | 
| 335 | 
             
                                )
         | 
| 336 |  | 
| 337 | 
            +
                                markdown_content, plain_text, page_embedded_imgs = process_single_page(
         | 
| 338 | 
             
                                    image,
         | 
| 339 | 
             
                                    model_runtime,
         | 
| 340 | 
             
                                    tokenizer,
         | 
|  | |
| 347 | 
             
                                    embed_images
         | 
| 348 | 
             
                                )
         | 
| 349 |  | 
| 350 | 
            +
                                # Add embedded images from PDF inline if any
         | 
| 351 | 
             
                                if embed_images and (page_num - 1) in embedded_images:
         | 
| 352 | 
            +
                                    # Insert PDF images right after the OCR'd content, but before page separator
         | 
| 353 | 
            +
                                    pdf_images_markdown = "\n\n"
         | 
| 354 | 
             
                                    for idx, img in enumerate(embedded_images[page_num - 1]):
         | 
| 355 | 
             
                                        try:
         | 
| 356 | 
             
                                            base64_img = image_to_base64(img, format='PNG')
         | 
| 357 | 
            +
                                            # Add inline without a big header
         | 
| 358 | 
            +
                                            pdf_images_markdown += f"\n\n"
         | 
| 359 | 
             
                                        except Exception as e:
         | 
| 360 | 
             
                                            print(f"Error embedding PDF image {idx+1}: {e}")
         | 
| 361 | 
            +
                                    
         | 
| 362 | 
            +
                                    # Append to the markdown content directly (inline)
         | 
| 363 | 
            +
                                    markdown_content += pdf_images_markdown
         | 
| 364 |  | 
| 365 | 
             
                                # Add page separator and content
         | 
| 366 | 
             
                                page_header = f"\n\n---\n\n# Page {page_num}\n\n"
         | 
