lucacadalora committed on
Commit
3959304
·
verified ·
1 Parent(s): 8aaa52d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -36
app.py CHANGED
@@ -149,7 +149,7 @@ def detect_figure_regions(text_result, original_image):
149
  def process_single_page(image, model_runtime, tokenizer, model_size, task_type, ref_text, is_eval_mode, output_path, page_num, embed_images=True):
150
  """
151
  Process a single page/image with DeepSeek-OCR
152
- Returns markdown content with embedded images if requested
153
  """
154
  # ===== choose task prompt =====
155
  if task_type == "📝 Free OCR":
@@ -210,42 +210,80 @@ def process_single_page(image, model_runtime, tokenizer, model_size, task_type,
210
  markdown_content = plain_text_result
211
 
212
  # ===== Embed images if requested =====
 
213
  if embed_images and markdown_content:
214
- # Check if markdown mentions figures/charts/images
215
- figure_keywords = ['figure', 'chart', 'graph', 'diagram', 'image', 'plot', 'illustration', 'table', 'screenshot']
216
- has_figure_mention = any(keyword in markdown_content.lower() for keyword in figure_keywords)
217
 
218
- if has_figure_mention:
219
- # Try to detect figure regions from bounding boxes
220
- figure_images = detect_figure_regions(plain_text_result, image)
221
-
222
- # If specific figures detected, embed them
223
- if figure_images:
224
- figures_markdown = "\n\n### Detected Figures\n\n"
225
- for idx, fig_img in enumerate(figure_images):
226
- try:
227
- base64_img = image_to_base64(fig_img, format='PNG')
228
- figures_markdown += f"![Figure {idx+1} from Page {page_num}]({base64_img})\n\n"
229
- except Exception as e:
230
- print(f"Error embedding figure {idx+1}: {e}")
231
- markdown_content += figures_markdown
232
- else:
233
- # No specific regions detected, but figures mentioned
234
- # Embed full page image for context
235
- try:
236
- base64_img = image_to_base64(image, format='JPEG')
237
- page_image_markdown = f"\n\n### Page {page_num} Visual Content\n\n![Page {page_num} Full View]({base64_img})\n\n"
 
 
 
 
 
 
 
238
 
239
- # Insert image after first paragraph or at the beginning
240
- lines = markdown_content.split('\n\n', 1)
241
- if len(lines) > 1:
242
- markdown_content = lines[0] + page_image_markdown + lines[1]
243
- else:
244
- markdown_content = page_image_markdown + markdown_content
 
 
 
 
245
  except Exception as e:
246
- print(f"Error embedding page image: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
- return markdown_content, plain_text_result
249
 
250
 
251
  # ===== Main Processing Function =====
@@ -296,7 +334,7 @@ def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_i
296
  desc=f"Processing page {page_num}/{total_pages}..."
297
  )
298
 
299
- markdown_content, plain_text = process_single_page(
300
  image,
301
  model_runtime,
302
  tokenizer,
@@ -309,15 +347,20 @@ def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_i
309
  embed_images
310
  )
311
 
312
- # Add embedded images from PDF if any
313
  if embed_images and (page_num - 1) in embedded_images:
314
- markdown_content += "\n\n### Embedded Images from PDF\n\n"
 
315
  for idx, img in enumerate(embedded_images[page_num - 1]):
316
  try:
317
  base64_img = image_to_base64(img, format='PNG')
318
- markdown_content += f"![Embedded Image {idx+1} - Page {page_num}]({base64_img})\n\n"
 
319
  except Exception as e:
320
  print(f"Error embedding PDF image {idx+1}: {e}")
 
 
 
321
 
322
  # Add page separator and content
323
  page_header = f"\n\n---\n\n# Page {page_num}\n\n"
 
149
  def process_single_page(image, model_runtime, tokenizer, model_size, task_type, ref_text, is_eval_mode, output_path, page_num, embed_images=True):
150
  """
151
  Process a single page/image with DeepSeek-OCR
152
+ Returns a (markdown_content, plain_text_result, embedded_images_list) tuple; the markdown has images embedded inline with context
153
  """
154
  # ===== choose task prompt =====
155
  if task_type == "📝 Free OCR":
 
210
  markdown_content = plain_text_result
211
 
212
  # ===== Embed images if requested =====
213
+ embedded_images_list = []
214
  if embed_images and markdown_content:
215
+ # Detect figure regions on the page image from the OCR output first
216
+ figure_images = detect_figure_regions(plain_text_result, image)
 
217
 
218
+ # Detect document type for smart placement
219
+ is_certificate = any(word in markdown_content.lower() for word in ['sertifikat', 'certificate', 'pengesahan', 'approval'])
220
+ is_letter = any(word in markdown_content.lower() for word in ['surat', 'letter', 'memo', 'kementerian', 'ministry'])
221
+ has_logo = 'logo' in markdown_content.lower() or 'seal' in markdown_content.lower()
222
+
223
+ # For certificates, official letters, or pages mentioning a logo/seal (with more than 20 words), embed the full page near the top
224
+ if (is_certificate or is_letter or has_logo) and len(markdown_content.split()) > 20:
225
+ try:
226
+ base64_img = image_to_base64(image, format='JPEG')
227
+ # Find the first heading or title
228
+ lines = markdown_content.split('\n')
229
+ insert_pos = 0
230
+
231
+ # Look for the first significant heading (##, ###, or all-caps line)
232
+ for i, line in enumerate(lines):
233
+ stripped = line.strip()
234
+ if stripped.startswith('##') or (stripped.isupper() and len(stripped.split()) >= 3):
235
+ insert_pos = i + 1
236
+ break
237
+
238
+ # Insert image right after the title
239
+ if insert_pos > 0:
240
+ lines.insert(insert_pos, f"\n![Page {page_num} - Official Document]({base64_img})\n")
241
+ markdown_content = '\n'.join(lines)
242
+ else:
243
+ # No clear heading, insert at top
244
+ markdown_content = f"![Page {page_num}]({base64_img})\n\n" + markdown_content
245
 
246
+ except Exception as e:
247
+ print(f"Error embedding page image: {e}")
248
+
249
+ # If specific figures detected (charts, graphs), embed them inline
250
+ elif figure_images:
251
+ figures_markdown = "\n\n"
252
+ for idx, fig_img in enumerate(figure_images):
253
+ try:
254
+ base64_img = image_to_base64(fig_img, format='PNG')
255
+ figures_markdown += f"![Figure {idx+1}]({base64_img})\n\n"
256
  except Exception as e:
257
+ print(f"Error embedding figure {idx+1}: {e}")
258
+
259
+ # Insert after the first paragraph
260
+ paragraphs = markdown_content.split('\n\n', 1)
261
+ if len(paragraphs) >= 2:
262
+ markdown_content = paragraphs[0] + figures_markdown + paragraphs[1]
263
+ else:
264
+ markdown_content = figures_markdown + markdown_content
265
+
266
+ # For pages with charts/graphs mentioned but not detected
267
+ elif any(word in markdown_content.lower() for word in ['chart', 'graph', 'diagram', 'figure']):
268
+ try:
269
+ base64_img = image_to_base64(image, format='JPEG')
270
+ # Insert after first mention of chart/graph
271
+ for keyword in ['chart', 'graph', 'diagram', 'figure']:
272
+ if keyword in markdown_content.lower():
273
+ parts = markdown_content.lower().split(keyword, 1)
274
+ # Find the position in the original text
275
+ pos = len(parts[0])
276
+ # Insert image after the current paragraph
277
+ para_end = markdown_content.find('\n\n', pos)
278
+ if para_end > 0:
279
+ markdown_content = markdown_content[:para_end] + f"\n\n![Visual Content]({base64_img})\n\n" + markdown_content[para_end+2:]
280
+ else:
281
+ markdown_content += f"\n\n![Visual Content]({base64_img})\n\n"
282
+ break
283
+ except Exception as e:
284
+ print(f"Error embedding contextual image: {e}")
285
 
286
+ return markdown_content, plain_text_result, embedded_images_list
287
 
288
 
289
  # ===== Main Processing Function =====
 
334
  desc=f"Processing page {page_num}/{total_pages}..."
335
  )
336
 
337
+ markdown_content, plain_text, page_embedded_imgs = process_single_page(
338
  image,
339
  model_runtime,
340
  tokenizer,
 
347
  embed_images
348
  )
349
 
350
+ # Add embedded images from PDF inline if any
351
  if embed_images and (page_num - 1) in embedded_images:
352
+ # Insert PDF images right after the OCR'd content, but before page separator
353
+ pdf_images_markdown = "\n\n"
354
  for idx, img in enumerate(embedded_images[page_num - 1]):
355
  try:
356
  base64_img = image_to_base64(img, format='PNG')
357
+ # Add inline without a big header
358
+ pdf_images_markdown += f"![Image {idx+1}]({base64_img})\n\n"
359
  except Exception as e:
360
  print(f"Error embedding PDF image {idx+1}: {e}")
361
+
362
+ # Append to the markdown content directly (inline)
363
+ markdown_content += pdf_images_markdown
364
 
365
  # Add page separator and content
366
  page_header = f"\n\n---\n\n# Page {page_num}\n\n"