prithivMLmods committed
Commit f5e1158 · verified · 1 parent: c086f4e

Update app.py

Files changed (1)
  1. app.py +206 -257
app.py CHANGED
@@ -1,8 +1,7 @@
  import os
  import random
- import uuid
- import json
- import time
+ import re
+ import ast
  import asyncio
  from threading import Thread

@@ -12,82 +11,69 @@ import torch
  import numpy as np
  from PIL import Image, ImageOps
  import cv2
-
  from transformers import (
-     Qwen2VLForConditionalGeneration,
      Qwen2_5_VLForConditionalGeneration,
-     AutoModelForCausalLM,
      AutoModelForVision2Seq,
      AutoProcessor,
      TextIteratorStreamer,
  )
- from transformers.image_utils import load_image

  from docling_core.types.doc import DoclingDocument, DocTagsDocument

- import re
- import ast
- import html
-
- # Constants for text generation
+ # --- Constants ---
  MAX_MAX_NEW_TOKENS = 5120
  DEFAULT_MAX_NEW_TOKENS = 3072
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+ DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+ # --- Model Loading ---
+ def load_model(model_id, model_class, subfolder=None):
+     """Generic function to load a model and its processor."""
+     processor_kwargs = {"trust_remote_code": True}
+     model_kwargs = {"trust_remote_code": True, "torch_dtype": torch.float16}

- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+     if subfolder:
+         processor_kwargs["subfolder"] = subfolder
+         model_kwargs["subfolder"] = subfolder
+
+     processor = AutoProcessor.from_pretrained(model_id, **processor_kwargs)
+     model = model_class.from_pretrained(model_id, **model_kwargs).to(DEVICE).eval()
+     return processor, model

  # Load Nanonets-OCR-s
- MODEL_ID_M = "nanonets/Nanonets-OCR-s"
- processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
- model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_M,
-     trust_remote_code=True,
-     torch_dtype=torch.float16
- ).to(device).eval()
+ processor_m, model_m = load_model(
+     "nanonets/Nanonets-OCR-s", Qwen2_5_VLForConditionalGeneration
+ )

  # Load MonkeyOCR
- MODEL_ID_G = "echo840/MonkeyOCR"
- SUBFOLDER = "Recognition"
- processor_g = AutoProcessor.from_pretrained(
-     MODEL_ID_G,
-     trust_remote_code=True,
-     subfolder=SUBFOLDER
+ processor_g, model_g = load_model(
+     "echo840/MonkeyOCR", Qwen2_5_VLForConditionalGeneration, subfolder="Recognition"
  )
- model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_G,
-     trust_remote_code=True,
-     subfolder=SUBFOLDER,
-     torch_dtype=torch.float16
- ).to(device).eval()

  # Load Typhoon-OCR-7B
- MODEL_ID_L = "scb10x/typhoon-ocr-7b"
- processor_l = AutoProcessor.from_pretrained(MODEL_ID_L, trust_remote_code=True)
- model_l = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_L,
-     trust_remote_code=True,
-     torch_dtype=torch.float16
- ).to(device).eval()
+ processor_l, model_l = load_model(
+     "scb10x/typhoon-ocr-7b", Qwen2_5_VLForConditionalGeneration
+ )

  # Load SmolDocling-256M-preview
- MODEL_ID_X = "ds4sd/SmolDocling-256M-preview"
- processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
- model_x = AutoModelForVision2Seq.from_pretrained(
-     MODEL_ID_X,
-     trust_remote_code=True,
-     torch_dtype=torch.float16
- ).to(device).eval()
+ processor_x, model_x = load_model(
+     "ds4sd/SmolDocling-256M-preview", AutoModelForVision2Seq
+ )

  # Thyme-RL
- MODEL_ID_N = "Kwai-Keye/Thyme-RL"
- processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
- model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_N,
-     trust_remote_code=True,
-     torch_dtype=torch.float16
- ).to(device).eval()
-
- # Preprocessing functions for SmolDocling-256M
+ processor_n, model_n = load_model(
+     "Kwai-Keye/Thyme-RL", Qwen2_5_VLForConditionalGeneration
+ )
+
+ MODEL_MAPPING = {
+     "Nanonets-OCR-s": (processor_m, model_m),
+     "MonkeyOCR-Recognition": (processor_g, model_g),
+     "Typhoon-OCR-7B": (processor_l, model_l),
+     "SmolDocling-256M-preview": (processor_x, model_x),
+     "Thyme-RL": (processor_n, model_n),
+ }
+
+ # --- Preprocessing Functions ---
  def add_random_padding(image, min_percent=0.1, max_percent=0.10):
      """Add random padding to an image based on its size."""
      image = image.convert("RGB")
@@ -96,94 +82,81 @@ def add_random_padding(image, min_percent=0.1, max_percent=0.10):
      pad_h_percent = random.uniform(min_percent, max_percent)
      pad_w = int(width * pad_w_percent)
      pad_h = int(height * pad_h_percent)
-     corner_pixel = image.getpixel((0, 0))  # Top-left corner
-     padded_image = ImageOps.expand(image, border=(pad_w, pad_h, pad_w, pad_h), fill=corner_pixel)
+     corner_pixel = image.getpixel((0, 0))
+     padded_image = ImageOps.expand(
+         image, border=(pad_w, pad_h, pad_w, pad_h), fill=corner_pixel
+     )
      return padded_image

  def normalize_values(text, target_max=500):
-     """Normalize numerical values in text to a target maximum."""
+     """Normalize numerical values in text to a target maximum for SmolDocling."""
      def normalize_list(values):
          max_value = max(values) if values else 1
          return [round((v / max_value) * target_max) for v in values]

      def process_match(match):
-         num_list = ast.literal_eval(match.group(0))
-         normalized = normalize_list(num_list)
-         return "".join([f"<loc_{num}>" for num in normalized])
+         try:
+             num_list = ast.literal_eval(match.group(0))
+             normalized = normalize_list(num_list)
+             return "".join([f"<loc_{num}>" for num in normalized])
+         except (ValueError, SyntaxError):
+             return match.group(0)

      pattern = r"\[([\d\.\s,]+)\]"
-     normalized_text = re.sub(pattern, process_match, text)
-     return normalized_text
+     return re.sub(pattern, process_match, text)

- def downsample_video(video_path):
-     """Downsample a video to evenly spaced frames, returning PIL images with timestamps."""
+ def downsample_video(video_path, num_frames=10):
+     """Downsample a video to evenly spaced frames, returning PIL images."""
+     if not video_path:
+         return []
      vidcap = cv2.VideoCapture(video_path)
      total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-     fps = vidcap.get(cv2.CAP_PROP_FPS)
      frames = []
-     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
+     frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+
      for i in frame_indices:
          vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
          success, image = vidcap.read()
          if success:
-             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-             pil_image = Image.fromarray(image)
-             timestamp = round(i / fps, 2)
-             frames.append((pil_image, timestamp))
+             image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+             frames.append(Image.fromarray(image_rgb))
      vidcap.release()
      return frames

- @spaces.GPU
- def generate_image(model_name: str, text: str, image: Image.Image,
-                    max_new_tokens: int = 1024,
-                    temperature: float = 0.6,
-                    top_p: float = 0.9,
-                    top_k: int = 50,
-                    repetition_penalty: float = 1.2):
-     """Generate responses for image input using the selected model."""
-     if model_name == "Nanonets-OCR-s":
-         processor = processor_m
-         model = model_m
-     elif model_name == "MonkeyOCR-Recognition":
-         processor = processor_g
-         model = model_g
-     elif model_name == "SmolDocling-256M-preview":
-         processor = processor_x
-         model = model_x
-     elif model_name == "Typhoon-OCR-7B":
-         processor = processor_l
-         model = model_l
-     elif model_name == "Thyme-RL":
-         processor = processor_n
-         model = model_n
-     else:
-         yield "Invalid model selected.", "Invalid model selected."
+ # --- Core Generation Logic ---
+ def _generate_response(model_name, text, images, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
+     """Helper function to handle model inference."""
+     if not images:
+         yield "Please upload an image or video.", ""
          return

-     if image is None:
-         yield "Please upload an image.", "Please upload an image."
+     try:
+         processor, model = MODEL_MAPPING[model_name]
+     except KeyError:
+         yield "Invalid model selected.", ""
          return

-     images = [image]
-
+     # Model-specific preprocessing
      if model_name == "SmolDocling-256M-preview":
-         if "OTSL" in text or "code" in text:
+         if any(keyword in text for keyword in ["OTSL", "code"]):
              images = [add_random_padding(img) for img in images]
-         if "OCR at text at" in text or "Identify element" in text or "formula" in text:
+         if any(keyword in text for keyword in ["OCR at text at", "Identify element", "formula"]):
              text = normalize_values(text, target_max=500)

      messages = [
          {
              "role": "user",
-             "content": [{"type": "image"} for _ in images] + [
-                 {"type": "text", "text": text}
-             ]
+             "content": [{"type": "image"}] * len(images) + [{"type": "text", "text": text}],
          }
      ]
+
      prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+     inputs = processor(text=prompt, images=images, return_tensors="pt").to(DEVICE)
+
+     streamer = TextIteratorStreamer(
+         processor, skip_prompt=True, skip_special_tokens=True
+     )

-     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
      generation_kwargs = {
          **inputs,
          "streamer": streamer,
@@ -193,6 +166,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
          "top_k": top_k,
          "repetition_penalty": repetition_penalty,
      }
+
      thread = Thread(target=model.generate, kwargs=generation_kwargs)
      thread.start()

@@ -201,102 +175,70 @@ def generate_image(model_name: str, text: str, image: Image.Image,
          buffer += new_text.replace("<|im_end|>", "")
          yield buffer, buffer

+     # Model-specific post-processing
      if model_name == "SmolDocling-256M-preview":
          cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
-         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
+         is_doc_tag = any(
+             tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]
+         )
+         if is_doc_tag:
              if "<chart>" in cleaned_output:
                  cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
              cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
-             doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
-             doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
-             markdown_output = doc.export_to_markdown()
-             yield buffer, markdown_output
+
+             try:
+                 doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
+                 doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
+                 markdown_output = doc.export_to_markdown()
+                 yield buffer, markdown_output
+             except Exception as e:
+                 yield buffer, f"Error processing Docling output: {e}"
          else:
              yield buffer, cleaned_output

  @spaces.GPU
- def generate_video(model_name: str, text: str, video_path: str,
-                    max_new_tokens: int = 1024,
-                    temperature: float = 0.6,
-                    top_p: float = 0.9,
-                    top_k: int = 50,
-                    repetition_penalty: float = 1.2):
-     """Generate responses for video input using the selected model."""
-     if model_name == "Nanonets-OCR-s":
-         processor = processor_m
-         model = model_m
-     elif model_name == "MonkeyOCR-Recognition":
-         processor = processor_g
-         model = model_g
-     elif model_name == "SmolDocling-256M-preview":
-         processor = processor_x
-         model = model_x
-     elif model_name == "Typhoon-OCR-7B":
-         processor = processor_l
-         model = model_l
-     elif model_name == "Thyme-RL":
-         processor = processor_n
-         model = model_n
-     else:
-         yield "Invalid model selected.", "Invalid model selected."
+ def generate_for_image(model_name, text, image, *args):
+     """Generate responses for a single image input."""
+     if image is None:
+         yield "Please upload an image.", ""
          return
+     yield from _generate_response(model_name, text, [image], *args)

+
+ @spaces.GPU
+ def generate_for_video(model_name, text, video_path, *args):
+     """Generate responses for video input by downsampling frames."""
      if video_path is None:
-         yield "Please upload a video.", "Please upload a video."
+         yield "Please upload a video.", ""
          return
-
      frames = downsample_video(video_path)
-     images = [frame for frame, _ in frames]
-
-     if model_name == "SmolDocling-256M-preview":
-         if "OTSL" in text or "code" in text:
-             images = [add_random_padding(img) for img in images]
-         if "OCR at text at" in text or "Identify element" in text or "formula" in text:
-             text = normalize_values(text, target_max=500)
-
-     messages = [
-         {
-             "role": "user",
-             "content": [{"type": "image"} for _ in images] + [
-                 {"type": "text", "text": text}
-             ]
-         }
-     ]
-     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-
-     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-     generation_kwargs = {
-         **inputs,
-         "streamer": streamer,
-         "max_new_tokens": max_new_tokens,
-         "temperature": temperature,
-         "top_p": top_p,
-         "top_k": top_k,
-         "repetition_penalty": repetition_penalty,
-     }
-     thread = Thread(target=model.generate, kwargs=generation_kwargs)
-     thread.start()
+     if not frames:
+         yield "Could not process video. Please check the file.", ""
+         return
+     yield from _generate_response(model_name, text, frames, *args)

-     buffer = ""
-     for new_text in streamer:
-         buffer += new_text.replace("<|im_end|>", "")
-         yield buffer, buffer

-     if model_name == "SmolDocling-256M-preview":
-         cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
-         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
-             if "<chart>" in cleaned_output:
-                 cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
-             cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
-             doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
-             doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
-             markdown_output = doc.export_to_markdown()
-             yield buffer, markdown_output
-         else:
-             yield buffer, cleaned_output
+ # --- Gradio Interface ---
+ css = """
+ .submit-btn {
+     background-color: #2980b9 !important;
+     color: white !important;
+     font-weight: bold !important;
+     border: none !important;
+     transition: background-color 0.3s ease;
+ }
+ .submit-btn:hover {
+     background-color: #3498db !important;
+ }
+ .output-container {
+     border: 2px solid #4682B4;
+     border-radius: 10px;
+     padding: 20px;
+     height: 100%;
+ }
+ """

- # Define examples for image and video inference
+ # Define examples
  image_examples = [
      ["Reconstruct the doc [table] as it is.", "images/0.png"],
      ["Describe the image!", "images/8.png"],
@@ -306,92 +248,99 @@ image_examples = [
      ["Convert chart to OTSL.", "images/4.png"],
      ["Convert code to text", "images/5.jpg"],
      ["Convert this table to OTSL.", "images/6.jpg"],
-     ["Convert formula to late.", "images/7.jpg"],
+     ["Convert formula to latex.", "images/7.jpg"],
  ]

  video_examples = [
      ["Explain the video in detail.", "videos/1.mp4"],
-     ["Explain the video in detail.", "videos/2.mp4"]
+     ["Explain the video in detail.", "videos/2.mp4"],
  ]

- #css
- css = """
- .submit-btn {
-     background-color: #2980b9 !important;
-     color: white !important;
- }
- .submit-btn:hover {
-     background-color: #3498db !important;
- }
- .canvas-output {
-     border: 2px solid #4682B4;
-     border-radius: 10px;
-     padding: 20px;
- }
- """
-
- # Create the Gradio Interface
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-     gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+     gr.Markdown("# **[Multimodal OCR²](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+     gr.Markdown("A unified interface for state-of-the-art multimodal and document AI models. Select a model, upload an image or video, and enter a query to begin.")
+
      with gr.Row():
-         with gr.Column():
-             with gr.Tabs():
-                 with gr.TabItem("Image Inference"):
-                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
-                     image_upload = gr.Image(type="pil", label="Image", height=290)
-                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
-                     gr.Examples(
-                         examples=image_examples,
-                         inputs=[image_query, image_upload]
-                     )
-                 with gr.TabItem("Video Inference"):
-                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
-                     video_upload = gr.Video(label="Video", height=290)
-                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
-                     gr.Examples(
-                         examples=video_examples,
-                         inputs=[video_query, video_upload]
-                     )
-             with gr.Accordion("Advanced options", open=False):
-                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
-                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
-                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
-                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
-                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-
-         with gr.Column():
-             with gr.Column(elem_classes="canvas-output"):
-                 gr.Markdown("## Output")
-                 raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5)
-
-                 with gr.Accordion("(Result.md)", open=False):
-                     formatted_output = gr.Markdown(label="(Result.md)")
-
+         # --- LEFT COLUMN (INPUTS) ---
+         with gr.Column(scale=1):
              model_choice = gr.Radio(
-                 choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
-                 label="Select Model",
-                 value="Nanonets-OCR-s"
+                 choices=[
+                     "Nanonets-OCR-s",
+                     "MonkeyOCR-Recognition",
+                     "Thyme-RL",
+                     "Typhoon-OCR-7B",
+                     "SmolDocling-256M-preview",
+                 ],
+                 label="🤖 Select Model",
+                 value="Nanonets-OCR-s",
              )
+
+             with gr.Tabs():
+                 with gr.TabItem("🖼️ Image Inference"):
+                     image_query = gr.Textbox(label="Query", placeholder="e.g., 'OCR the document'")
+                     image_upload = gr.Image(type="pil", label="Upload Image")
+                     image_submit = gr.Button("Generate", elem_classes="submit-btn")
+                     gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
+
+                 with gr.TabItem("🎬 Video Inference"):
+                     video_query = gr.Textbox(label="Query", placeholder="e.g., 'What is happening in this video?'")
+                     video_upload = gr.Video(label="Upload Video")
+                     video_submit = gr.Button("Generate", elem_classes="submit-btn")
+                     gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
+
+             with gr.Accordion("⚙️ Advanced Options", open=False):
+                 max_new_tokens = gr.Slider(
+                     label="Max New Tokens", min=1, max=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS
+                 )
+                 temperature = gr.Slider(
+                     label="Temperature", min=0.1, max=2.0, step=0.1, value=0.6
+                 )
+                 top_p = gr.Slider(
+                     label="Top-P", min=0.05, max=1.0, step=0.05, value=0.9
+                 )
+                 top_k = gr.Slider(label="Top-K", min=1, max=1000, step=1, value=50)
+                 repetition_penalty = gr.Slider(
+                     label="Repetition Penalty", min=1.0, max=2.0, step=0.05, value=1.2
+                 )

-     gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)")
-     gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
-     gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
-     gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
-     gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
-     gr.Markdown("> [Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL): Thyme: Think Beyond Images. Thyme transcends traditional ``thinking with images'' paradigms by autonomously generating and executing diverse image processing and computational operations through executable code, significantly enhancing performance on high-resolution perception and complex reasoning tasks.")
-     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
+             advanced_params = [max_new_tokens, temperature, top_p, top_k, repetition_penalty]
+
+         # --- RIGHT COLUMN (OUTPUTS & INFO) ---
+         with gr.Column(scale=2):
+             with gr.Column(elem_classes="output-container"):
+                 gr.Markdown("## Output")
+                 raw_output = gr.Textbox(
+                     label="Raw Output Stream", interactive=False, lines=8
+                 )
+                 formatted_output = gr.Markdown(label="Formatted Result (Markdown)")

+             with gr.Accordion("💻 Model Information", open=True):
+                 gr.Markdown(
+                     """
+                     - **[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)**: Transforms documents into structured markdown with intelligent content recognition.
+                     - **[SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview)**: An efficient multimodal model for converting documents to structured formats.
+                     - **[MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR)**: Adopts a Structure-Recognition-Relation paradigm for efficient document processing.
+                     - **[Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b)**: A bilingual (Thai/English) document parsing model for real-world documents.
+                     - **[Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL)**: Generates and executes code for image processing and complex reasoning tasks.
+                     ---
+                     > ⚠️ **Note**: Performance on video inference tasks is experimental and may vary between models.
+
+                     > [Report a Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)
+                     """
+                 )
+
+     # --- Event Handlers ---
      image_submit.click(
-         fn=generate_image,
-         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-         outputs=[raw_output, formatted_output]
+         fn=generate_for_image,
+         inputs=[model_choice, image_query, image_upload] + advanced_params,
+         outputs=[raw_output, formatted_output],
      )
+
      video_submit.click(
-         fn=generate_video,
-         inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-         outputs=[raw_output,
-                  formatted_output]
+         fn=generate_for_video,
+         inputs=[model_choice, video_query, video_upload] + advanced_params,
+         outputs=[raw_output, formatted_output],
      )

  if __name__ == "__main__":
-     demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
+     demo.queue(max_size=50).launch(share=True, show_error=True)