prithivMLmods committed on
Commit
50dc858
·
verified ·
1 Parent(s): eb96e48

update app

Browse files
Files changed (1) hide show
  1. app.py +102 -68
app.py CHANGED
@@ -5,6 +5,7 @@ import json
5
  import time
6
  import asyncio
7
  from threading import Thread
 
8
 
9
  import gradio as gr
10
  import spaces
@@ -12,6 +13,7 @@ import torch
12
  import numpy as np
13
  from PIL import Image, ImageOps
14
  import cv2
 
15
 
16
  from transformers import (
17
  Qwen2VLForConditionalGeneration,
@@ -22,6 +24,8 @@ from transformers import (
22
  TextIteratorStreamer,
23
  )
24
  from transformers.image_utils import load_image
 
 
25
 
26
  from docling_core.types.doc import DoclingDocument, DocTagsDocument
27
 
@@ -29,6 +33,79 @@ import re
29
  import ast
30
  import html
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  # Constants for text generation
33
  MAX_MAX_NEW_TOKENS = 5120
34
  DEFAULT_MAX_NEW_TOKENS = 3072
@@ -121,7 +198,7 @@ def downsample_video(video_path):
121
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
122
  fps = vidcap.get(cv2.CAP_PROP_FPS)
123
  frames = []
124
- frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
125
  for i in frame_indices:
126
  vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
127
  success, image = vidcap.read()
@@ -142,20 +219,15 @@ def generate_image(model_name: str, text: str, image: Image.Image,
142
  repetition_penalty: float = 1.2):
143
  """Generate responses for image input using the selected model."""
144
  if model_name == "Nanonets-OCR-s":
145
- processor = processor_m
146
- model = model_m
147
  elif model_name == "MonkeyOCR-Recognition":
148
- processor = processor_g
149
- model = model_g
150
  elif model_name == "SmolDocling-256M-preview":
151
- processor = processor_x
152
- model = model_x
153
  elif model_name == "Typhoon-OCR-7B":
154
- processor = processor_l
155
- model = model_l
156
  elif model_name == "Thyme-RL":
157
- processor = processor_n
158
- model = model_n
159
  else:
160
  yield "Invalid model selected.", "Invalid model selected."
161
  return
@@ -223,20 +295,15 @@ def generate_video(model_name: str, text: str, video_path: str,
223
  repetition_penalty: float = 1.2):
224
  """Generate responses for video input using the selected model."""
225
  if model_name == "Nanonets-OCR-s":
226
- processor = processor_m
227
- model = model_m
228
  elif model_name == "MonkeyOCR-Recognition":
229
- processor = processor_g
230
- model = model_g
231
  elif model_name == "SmolDocling-256M-preview":
232
- processor = processor_x
233
- model = model_x
234
  elif model_name == "Typhoon-OCR-7B":
235
- processor = processor_l
236
- model = model_l
237
  elif model_name == "Thyme-RL":
238
- processor = processor_n
239
- model = model_n
240
  else:
241
  yield "Invalid model selected.", "Invalid model selected."
242
  return
@@ -314,44 +381,22 @@ video_examples = [
314
  ["Explain the video in detail.", "videos/2.mp4"]
315
  ]
316
 
317
- #css
318
- css = """
319
- .submit-btn {
320
- background-color: #2980b9 !important;
321
- color: white !important;
322
- }
323
- .submit-btn:hover {
324
- background-color: #3498db !important;
325
- }
326
- .canvas-output {
327
- border: 2px solid #4682B4;
328
- border-radius: 10px;
329
- padding: 20px;
330
- }
331
- """
332
-
333
  # Create the Gradio Interface
334
- with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
335
- gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
336
  with gr.Row():
337
- with gr.Column():
338
  with gr.Tabs():
339
  with gr.TabItem("Image Inference"):
340
  image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
341
  image_upload = gr.Image(type="pil", label="Image", height=290)
342
- image_submit = gr.Button("Submit", elem_classes="submit-btn")
343
- gr.Examples(
344
- examples=image_examples,
345
- inputs=[image_query, image_upload]
346
- )
347
  with gr.TabItem("Video Inference"):
348
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
349
  video_upload = gr.Video(label="Video", height=290)
350
- video_submit = gr.Button("Submit", elem_classes="submit-btn")
351
- gr.Examples(
352
- examples=video_examples,
353
- inputs=[video_query, video_upload]
354
- )
355
  with gr.Accordion("Advanced options", open=False):
356
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
357
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
@@ -359,13 +404,11 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
359
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
360
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
361
 
362
- with gr.Column():
363
- with gr.Column(elem_classes="canvas-output"):
364
- gr.Markdown("## Output")
365
- raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5, show_copy_button=True)
366
-
367
- with gr.Accordion("(Result.md)", open=False):
368
- formatted_output = gr.Markdown(label="(Result.md)")
369
 
370
  model_choice = gr.Radio(
371
  choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
@@ -373,14 +416,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
373
  value="Nanonets-OCR-s"
374
  )
375
 
376
- gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)")
377
- gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
378
- gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
379
- gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
380
- gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
381
- gr.Markdown("> [Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL): Thyme: Think Beyond Images. Thyme transcends traditional ``thinking with images'' paradigms by autonomously generating and executing diverse image processing and computational operations through executable code, significantly enhancing performance on high-resolution perception and complex reasoning tasks.")
382
- gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
383
-
384
  image_submit.click(
385
  fn=generate_image,
386
  inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
@@ -389,9 +424,8 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
389
  video_submit.click(
390
  fn=generate_video,
391
  inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
392
- outputs=[raw_output,
393
- formatted_output]
394
  )
395
 
396
  if __name__ == "__main__":
397
- demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
 
5
  import time
6
  import asyncio
7
  from threading import Thread
8
+ from typing import Iterable
9
 
10
  import gradio as gr
11
  import spaces
 
13
  import numpy as np
14
  from PIL import Image, ImageOps
15
  import cv2
16
+ import requests
17
 
18
  from transformers import (
19
  Qwen2VLForConditionalGeneration,
 
24
  TextIteratorStreamer,
25
  )
26
  from transformers.image_utils import load_image
27
+ from gradio.themes import Soft
28
+ from gradio.themes.utils import colors, fonts, sizes
29
 
30
  from docling_core.types.doc import DoclingDocument, DocTagsDocument
31
 
 
33
  import ast
34
  import html
35
 
36
+ # --- Theme and CSS Definition ---
37
+
38
+ colors.steel_blue = colors.Color(
39
+ name="steel_blue",
40
+ c50="#EBF3F8",
41
+ c100="#D3E5F0",
42
+ c200="#A8CCE1",
43
+ c300="#7DB3D2",
44
+ c400="#529AC3",
45
+ c500="#4682B4", # SteelBlue base color
46
+ c600="#3E72A0",
47
+ c700="#36638C",
48
+ c800="#2E5378",
49
+ c900="#264364",
50
+ c950="#1E3450",
51
+ )
52
+
53
+ class SteelBlueTheme(Soft):
54
+ def __init__(
55
+ self,
56
+ *,
57
+ primary_hue: colors.Color | str = colors.gray,
58
+ secondary_hue: colors.Color | str = colors.steel_blue,
59
+ neutral_hue: colors.Color | str = colors.slate,
60
+ text_size: sizes.Size | str = sizes.text_lg,
61
+ font: fonts.Font | str | Iterable[fonts.Font | str] = (
62
+ fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
63
+ ),
64
+ font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
65
+ fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
66
+ ),
67
+ ):
68
+ super().__init__(
69
+ primary_hue=primary_hue,
70
+ secondary_hue=secondary_hue,
71
+ neutral_hue=neutral_hue,
72
+ text_size=text_size,
73
+ font=font,
74
+ font_mono=font_mono,
75
+ )
76
+ super().set(
77
+ background_fill_primary="*primary_50",
78
+ background_fill_primary_dark="*primary_900",
79
+ body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
80
+ body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
81
+ button_primary_text_color="white",
82
+ button_primary_text_color_hover="white",
83
+ button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
84
+ button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
85
+ button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_700)",
86
+ button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_600)",
87
+ slider_color="*secondary_500",
88
+ slider_color_dark="*secondary_600",
89
+ block_title_text_weight="600",
90
+ block_border_width="3px",
91
+ block_shadow="*shadow_drop_lg",
92
+ button_primary_shadow="*shadow_drop_lg",
93
+ button_large_padding="11px",
94
+ color_accent_soft="*primary_100",
95
+ block_label_background_fill="*primary_200",
96
+ )
97
+
98
+ steel_blue_theme = SteelBlueTheme()
99
+
100
+ css = """
101
+ #main-title h1 {
102
+ font-size: 2.3em !important;
103
+ }
104
+ #output-title h2 {
105
+ font-size: 2.1em !important;
106
+ }
107
+ """
108
+
109
  # Constants for text generation
110
  MAX_MAX_NEW_TOKENS = 5120
111
  DEFAULT_MAX_NEW_TOKENS = 3072
 
198
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
199
  fps = vidcap.get(cv2.CAP_PROP_FPS)
200
  frames = []
201
+ frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
202
  for i in frame_indices:
203
  vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
204
  success, image = vidcap.read()
 
219
  repetition_penalty: float = 1.2):
220
  """Generate responses for image input using the selected model."""
221
  if model_name == "Nanonets-OCR-s":
222
+ processor, model = processor_m, model_m
 
223
  elif model_name == "MonkeyOCR-Recognition":
224
+ processor, model = processor_g, model_g
 
225
  elif model_name == "SmolDocling-256M-preview":
226
+ processor, model = processor_x, model_x
 
227
  elif model_name == "Typhoon-OCR-7B":
228
+ processor, model = processor_l, model_l
 
229
  elif model_name == "Thyme-RL":
230
+ processor, model = processor_n, model_n
 
231
  else:
232
  yield "Invalid model selected.", "Invalid model selected."
233
  return
 
295
  repetition_penalty: float = 1.2):
296
  """Generate responses for video input using the selected model."""
297
  if model_name == "Nanonets-OCR-s":
298
+ processor, model = processor_m, model_m
 
299
  elif model_name == "MonkeyOCR-Recognition":
300
+ processor, model = processor_g, model_g
 
301
  elif model_name == "SmolDocling-256M-preview":
302
+ processor, model = processor_x, model_x
 
303
  elif model_name == "Typhoon-OCR-7B":
304
+ processor, model = processor_l, model_l
 
305
  elif model_name == "Thyme-RL":
306
+ processor, model = processor_n, model_n
 
307
  else:
308
  yield "Invalid model selected.", "Invalid model selected."
309
  return
 
381
  ["Explain the video in detail.", "videos/2.mp4"]
382
  ]
383
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  # Create the Gradio Interface
385
+ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
386
+ gr.Markdown("# **Multimodal OCR2**", elem_id="main-title")
387
  with gr.Row():
388
+ with gr.Column(scale=2):
389
  with gr.Tabs():
390
  with gr.TabItem("Image Inference"):
391
  image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
392
  image_upload = gr.Image(type="pil", label="Image", height=290)
393
+ image_submit = gr.Button("Submit", variant="primary")
394
+ gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
 
 
 
395
  with gr.TabItem("Video Inference"):
396
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
397
  video_upload = gr.Video(label="Video", height=290)
398
+ video_submit = gr.Button("Submit", variant="primary")
399
+ gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
 
 
 
400
  with gr.Accordion("Advanced options", open=False):
401
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
402
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
 
404
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
405
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
406
 
407
+ with gr.Column(scale=3):
408
+ gr.Markdown("## Output", elem_id="output-title")
409
+ raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=14, show_copy_button=True)
410
+ with gr.Accordion("(Result.md)", open=False):
411
+ formatted_output = gr.Markdown(label="(Result.md)")
 
 
412
 
413
  model_choice = gr.Radio(
414
  choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
 
416
  value="Nanonets-OCR-s"
417
  )
418
 
 
 
 
 
 
 
 
 
419
  image_submit.click(
420
  fn=generate_image,
421
  inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
 
424
  video_submit.click(
425
  fn=generate_video,
426
  inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
427
+ outputs=[raw_output, formatted_output]
 
428
  )
429
 
430
  if __name__ == "__main__":
431
+ demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)