prithivMLmods committed
Commit eb21945 · verified · 1 Parent(s): 17a6c18

Update app.py

Files changed (1)
  1. app.py +256 -207
app.py CHANGED
@@ -1,7 +1,8 @@
  import os
  import random
- import re
- import ast
  import asyncio
  from threading import Thread

@@ -11,69 +12,82 @@ import torch
  import numpy as np
  from PIL import Image, ImageOps
  import cv2
  from transformers import (
      Qwen2_5_VLForConditionalGeneration,
      AutoModelForVision2Seq,
      AutoProcessor,
      TextIteratorStreamer,
  )

  from docling_core.types.doc import DoclingDocument, DocTagsDocument

- # --- Constants ---
  MAX_MAX_NEW_TOKENS = 5120
  DEFAULT_MAX_NEW_TOKENS = 3072
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
- DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
- # --- Model Loading ---
- def load_model(model_id, model_class, subfolder=None):
-     """Generic function to load a model and its processor."""
-     processor_kwargs = {"trust_remote_code": True}
-     model_kwargs = {"trust_remote_code": True, "torch_dtype": torch.float16}

-     if subfolder:
-         processor_kwargs["subfolder"] = subfolder
-         model_kwargs["subfolder"] = subfolder
-
-     processor = AutoProcessor.from_pretrained(model_id, **processor_kwargs)
-     model = model_class.from_pretrained(model_id, **model_kwargs).to(DEVICE).eval()
-     return processor, model

  # Load Nanonets-OCR-s
- processor_m, model_m = load_model(
-     "nanonets/Nanonets-OCR-s", Qwen2_5_VLForConditionalGeneration
- )

  # Load MonkeyOCR
- processor_g, model_g = load_model(
-     "echo840/MonkeyOCR", Qwen2_5_VLForConditionalGeneration, subfolder="Recognition"
  )

  # Load Typhoon-OCR-7B
- processor_l, model_l = load_model(
-     "scb10x/typhoon-ocr-7b", Qwen2_5_VLForConditionalGeneration
- )

  # Load SmolDocling-256M-preview
- processor_x, model_x = load_model(
-     "ds4sd/SmolDocling-256M-preview", AutoModelForVision2Seq
- )

  # Thyme-RL
- processor_n, model_n = load_model(
-     "Kwai-Keye/Thyme-RL", Qwen2_5_VLForConditionalGeneration
- )
-
- MODEL_MAPPING = {
-     "Nanonets-OCR-s": (processor_m, model_m),
-     "MonkeyOCR-Recognition": (processor_g, model_g),
-     "Typhoon-OCR-7B": (processor_l, model_l),
-     "SmolDocling-256M-preview": (processor_x, model_x),
-     "Thyme-RL": (processor_n, model_n),
- }
-
- # --- Preprocessing Functions ---
  def add_random_padding(image, min_percent=0.1, max_percent=0.10):
      """Add random padding to an image based on its size."""
      image = image.convert("RGB")
@@ -82,81 +96,94 @@ def add_random_padding(image, min_percent=0.1, max_percent=0.10):
      pad_h_percent = random.uniform(min_percent, max_percent)
      pad_w = int(width * pad_w_percent)
      pad_h = int(height * pad_h_percent)
-     corner_pixel = image.getpixel((0, 0))
-     padded_image = ImageOps.expand(
-         image, border=(pad_w, pad_h, pad_w, pad_h), fill=corner_pixel
-     )
      return padded_image

  def normalize_values(text, target_max=500):
-     """Normalize numerical values in text to a target maximum for SmolDocling."""
      def normalize_list(values):
          max_value = max(values) if values else 1
          return [round((v / max_value) * target_max) for v in values]

      def process_match(match):
-         try:
-             num_list = ast.literal_eval(match.group(0))
-             normalized = normalize_list(num_list)
-             return "".join([f"<loc_{num}>" for num in normalized])
-         except (ValueError, SyntaxError):
-             return match.group(0)

      pattern = r"\[([\d\.\s,]+)\]"
-     return re.sub(pattern, process_match, text)

- def downsample_video(video_path, num_frames=10):
-     """Downsample a video to evenly spaced frames, returning PIL images."""
-     if not video_path:
-         return []
      vidcap = cv2.VideoCapture(video_path)
      total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
      frames = []
-     frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
-
      for i in frame_indices:
          vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
          success, image = vidcap.read()
          if success:
-             image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-             frames.append(Image.fromarray(image_rgb))
      vidcap.release()
      return frames

- # --- Core Generation Logic ---
- def _generate_response(model_name, text, images, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
-     """Helper function to handle model inference."""
-     if not images:
-         yield "Please upload an image or video.", ""
          return

-     try:
-         processor, model = MODEL_MAPPING[model_name]
-     except KeyError:
-         yield "Invalid model selected.", ""
          return

-     # Model-specific preprocessing
      if model_name == "SmolDocling-256M-preview":
-         if any(keyword in text for keyword in ["OTSL", "code"]):
              images = [add_random_padding(img) for img in images]
-         if any(keyword in text for keyword in ["OCR at text at", "Identify element", "formula"]):
              text = normalize_values(text, target_max=500)

      messages = [
          {
              "role": "user",
-             "content": [{"type": "image"}] * len(images) + [{"type": "text", "text": text}],
          }
      ]
-
      prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-     inputs = processor(text=prompt, images=images, return_tensors="pt").to(DEVICE)
-
-     streamer = TextIteratorStreamer(
-         processor, skip_prompt=True, skip_special_tokens=True
-     )

      generation_kwargs = {
          **inputs,
          "streamer": streamer,
@@ -166,7 +193,6 @@ def _generate_response(model_name, text, images, max_new_tokens, temperature, to
          "top_k": top_k,
          "repetition_penalty": repetition_penalty,
      }
-
      thread = Thread(target=model.generate, kwargs=generation_kwargs)
      thread.start()

@@ -175,70 +201,102 @@ def _generate_response(model_name, text, images, max_new_tokens, temperature, to
          buffer += new_text.replace("<|im_end|>", "")
          yield buffer, buffer

-     # Model-specific post-processing
      if model_name == "SmolDocling-256M-preview":
          cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
-         is_doc_tag = any(
-             tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]
-         )
-         if is_doc_tag:
              if "<chart>" in cleaned_output:
                  cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
              cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
-
-             try:
-                 doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
-                 doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
-                 markdown_output = doc.export_to_markdown()
-                 yield buffer, markdown_output
-             except Exception as e:
-                 yield buffer, f"Error processing Docling output: {e}"
      else:
          yield buffer, cleaned_output

  @spaces.GPU
- def generate_for_image(model_name, text, image, *args):
-     """Generate responses for a single image input."""
-     if image is None:
-         yield "Please upload an image.", ""
          return
-     yield from _generate_response(model_name, text, [image], *args)

-
- @spaces.GPU
- def generate_for_video(model_name, text, video_path, *args):
-     """Generate responses for video input by downsampling frames."""
      if video_path is None:
-         yield "Please upload a video.", ""
          return
      frames = downsample_video(video_path)
-     if not frames:
-         yield "Could not process video. Please check the file.", ""
-         return
-     yield from _generate_response(model_name, text, frames, *args)


- # --- Gradio Interface ---
- css = """
- .submit-btn {
-     background-color: #2980b9 !important;
-     color: white !important;
-     font-weight: bold !important;
-     border: none !important;
-     transition: background-color 0.3s ease;
- }
- .submit-btn:hover {
-     background-color: #3498db !important;
- }
- .output-container {
-     border: 2px solid #4682B4;
-     border-radius: 10px;
-     padding: 20px;
-     height: 100%;
- }
- """

- # Define examples
  image_examples = [
      ["Reconstruct the doc [table] as it is.", "images/0.png"],
      ["Describe the image!", "images/8.png"],
@@ -248,101 +306,92 @@ image_examples = [
      ["Convert chart to OTSL.", "images/4.png"],
      ["Convert code to text", "images/5.jpg"],
      ["Convert this table to OTSL.", "images/6.jpg"],
-     ["Convert formula to latex.", "images/7.jpg"],
  ]

  video_examples = [
      ["Explain the video in detail.", "videos/1.mp4"],
-     ["Explain the video in detail.", "videos/2.mp4"],
  ]

  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
      gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
-     gr.Markdown("A unified interface for state-of-the-art multimodal and document AI models. Select a model, upload an image or video, and enter a query to begin.")
-
      with gr.Row():
-         # --- LEFT COLUMN (INPUTS) ---
-         with gr.Column(scale=1):
-             model_choice = gr.Dropdown(
-                 choices=[
-                     "Nanonets-OCR-s",
-                     "MonkeyOCR-Recognition",
-                     "Thyme-RL",
-                     "Typhoon-OCR-7B",
-                     "SmolDocling-256M-preview",
-                 ],
-                 label="Select Model⚡",
-                 value="Nanonets-OCR-s",
-             )
-
              with gr.Tabs():
-                 with gr.TabItem("🖼️ Image Inference"):
-                     image_query = gr.Textbox(label="Query", placeholder="e.g., 'OCR the document'")
-                     image_upload = gr.Image(type="pil", label="Upload Image", height="299")
-                     image_submit = gr.Button("Generate", elem_classes="submit-btn")
-                     gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
-
-                 with gr.TabItem("🎬 Video Inference"):
-                     video_query = gr.Textbox(label="Query", placeholder="e.g., 'What is happening in this video?'")
-                     video_upload = gr.Video(label="Upload Video", height="299")
-                     video_submit = gr.Button("Generate", elem_classes="submit-btn")
-                     gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
-
-             with gr.Accordion("⚙️ Advanced Options", open=False):
-                 # FIX: Changed 'min' to 'minimum' and 'max' to 'maximum'
-                 max_new_tokens = gr.Slider(
-                     label="Max New Tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS
-                 )
-                 temperature = gr.Slider(
-                     label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.6
-                 )
-                 top_p = gr.Slider(
-                     label="Top-P", minimum=0.05, maximum=1.0, step=0.05, value=0.9
-                 )
-                 top_k = gr.Slider(label="Top-K", minimum=1, maximum=1000, step=1, value=50)
-                 repetition_penalty = gr.Slider(
-                     label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2
-                 )
-
-             advanced_params = [max_new_tokens, temperature, top_p, top_k, repetition_penalty]
-
-         # --- RIGHT COLUMN (OUTPUTS & INFO) ---
-         with gr.Column(scale=2):
-             with gr.Column(elem_classes="output-container"):
                  gr.Markdown("## Output")
-                 raw_output = gr.Textbox(
-                     label="Raw Output Stream", interactive=False, lines=8
-                 )
                  with gr.Accordion("(Result.md)", open=False):
-                     formatted_output = gr.Markdown(label="Formatted Result (Markdown)")

-             with gr.Accordion("💻 Model Information", open=True):
-                 gr.Markdown(
-                     """
-                     - **[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)**: nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.
-                     - **[SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview)**: SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.
-                     - **[MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR)**: MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.
-                     - **[Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b)**: A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.
-                     - **[Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL)**: Think Beyond Images. Thyme transcends traditional ``thinking with images'' paradigms by autonomously generating and executing diverse image processing and computational operations through executable code, significantly enhancing performance on high-resolution perception and complex reasoning tasks.
-
-                     - **⚠️Note**: Performance on video inference tasks is experimental and may vary between models.
-
-                     > [Report a Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)
-                     """
-                 )
-
-     # --- Event Handlers ---
      image_submit.click(
-         fn=generate_for_image,
-         inputs=[model_choice, image_query, image_upload] + advanced_params,
-         outputs=[raw_output, formatted_output],
      )
-
      video_submit.click(
-         fn=generate_for_video,
-         inputs=[model_choice, video_query, video_upload] + advanced_params,
-         outputs=[raw_output, formatted_output],
      )

  if __name__ == "__main__":
-     demo.queue(max_size=50).launch(share=True, show_error=True)

  import os
  import random
+ import uuid
+ import json
+ import time
  import asyncio
  from threading import Thread

  import numpy as np
  from PIL import Image, ImageOps
  import cv2
+
  from transformers import (
+     Qwen2VLForConditionalGeneration,
      Qwen2_5_VLForConditionalGeneration,
+     AutoModelForCausalLM,
      AutoModelForVision2Seq,
      AutoProcessor,
      TextIteratorStreamer,
  )
+ from transformers.image_utils import load_image

  from docling_core.types.doc import DoclingDocument, DocTagsDocument

+ import re
+ import ast
+ import html
+
+ # Constants for text generation
  MAX_MAX_NEW_TOKENS = 5120
  DEFAULT_MAX_NEW_TOKENS = 3072
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  # Load Nanonets-OCR-s
+ MODEL_ID_M = "nanonets/Nanonets-OCR-s"
+ processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_M,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()

  # Load MonkeyOCR
+ MODEL_ID_G = "echo840/MonkeyOCR"
+ SUBFOLDER = "Recognition"
+ processor_g = AutoProcessor.from_pretrained(
+     MODEL_ID_G,
+     trust_remote_code=True,
+     subfolder=SUBFOLDER
  )
+ model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_G,
+     trust_remote_code=True,
+     subfolder=SUBFOLDER,
+     torch_dtype=torch.float16
+ ).to(device).eval()

  # Load Typhoon-OCR-7B
+ MODEL_ID_L = "scb10x/typhoon-ocr-7b"
+ processor_l = AutoProcessor.from_pretrained(MODEL_ID_L, trust_remote_code=True)
+ model_l = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_L,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()

  # Load SmolDocling-256M-preview
+ MODEL_ID_X = "ds4sd/SmolDocling-256M-preview"
+ processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
+ model_x = AutoModelForVision2Seq.from_pretrained(
+     MODEL_ID_X,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()

  # Thyme-RL
+ MODEL_ID_N = "Kwai-Keye/Thyme-RL"
+ processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
+ model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_N,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ # Preprocessing functions for SmolDocling-256M
  def add_random_padding(image, min_percent=0.1, max_percent=0.10):
      """Add random padding to an image based on its size."""
      image = image.convert("RGB")

      pad_h_percent = random.uniform(min_percent, max_percent)
      pad_w = int(width * pad_w_percent)
      pad_h = int(height * pad_h_percent)
+     corner_pixel = image.getpixel((0, 0)) # Top-left corner
+     padded_image = ImageOps.expand(image, border=(pad_w, pad_h, pad_w, pad_h), fill=corner_pixel)
      return padded_image

  def normalize_values(text, target_max=500):
+     """Normalize numerical values in text to a target maximum."""
      def normalize_list(values):
          max_value = max(values) if values else 1
          return [round((v / max_value) * target_max) for v in values]

      def process_match(match):
+         num_list = ast.literal_eval(match.group(0))
+         normalized = normalize_list(num_list)
+         return "".join([f"<loc_{num}>" for num in normalized])

      pattern = r"\[([\d\.\s,]+)\]"
+     normalized_text = re.sub(pattern, process_match, text)
+     return normalized_text

+ def downsample_video(video_path):
+     """Downsample a video to evenly spaced frames, returning PIL images with timestamps."""
      vidcap = cv2.VideoCapture(video_path)
      total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+     fps = vidcap.get(cv2.CAP_PROP_FPS)
      frames = []
+     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
      for i in frame_indices:
          vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
          success, image = vidcap.read()
          if success:
+             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+             pil_image = Image.fromarray(image)
+             timestamp = round(i / fps, 2)
+             frames.append((pil_image, timestamp))
      vidcap.release()
      return frames

+ @spaces.GPU
+ def generate_image(model_name: str, text: str, image: Image.Image,
+                    max_new_tokens: int = 1024,
+                    temperature: float = 0.6,
+                    top_p: float = 0.9,
+                    top_k: int = 50,
+                    repetition_penalty: float = 1.2):
+     """Generate responses for image input using the selected model."""
+     if model_name == "Nanonets-OCR-s":
+         processor = processor_m
+         model = model_m
+     elif model_name == "MonkeyOCR-Recognition":
+         processor = processor_g
+         model = model_g
+     elif model_name == "SmolDocling-256M-preview":
+         processor = processor_x
+         model = model_x
+     elif model_name == "Typhoon-OCR-7B":
+         processor = processor_l
+         model = model_l
+     elif model_name == "Thyme-RL":
+         processor = processor_n
+         model = model_n
+     else:
+         yield "Invalid model selected.", "Invalid model selected."
          return

+     if image is None:
+         yield "Please upload an image.", "Please upload an image."
          return

+     images = [image]
+
      if model_name == "SmolDocling-256M-preview":
+         if "OTSL" in text or "code" in text:
              images = [add_random_padding(img) for img in images]
+         if "OCR at text at" in text or "Identify element" in text or "formula" in text:
              text = normalize_values(text, target_max=500)

      messages = [
          {
              "role": "user",
+             "content": [{"type": "image"} for _ in images] + [
+                 {"type": "text", "text": text}
+             ]
          }
      ]
      prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)

+     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
      generation_kwargs = {
          **inputs,
          "streamer": streamer,

          "top_k": top_k,
          "repetition_penalty": repetition_penalty,
      }
      thread = Thread(target=model.generate, kwargs=generation_kwargs)
      thread.start()

          buffer += new_text.replace("<|im_end|>", "")
          yield buffer, buffer

      if model_name == "SmolDocling-256M-preview":
          cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
+         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
              if "<chart>" in cleaned_output:
                  cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
              cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
+             doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
+             doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
+             markdown_output = doc.export_to_markdown()
+             yield buffer, markdown_output
      else:
          yield buffer, cleaned_output

  @spaces.GPU
+ def generate_video(model_name: str, text: str, video_path: str,
+                    max_new_tokens: int = 1024,
+                    temperature: float = 0.6,
+                    top_p: float = 0.9,
+                    top_k: int = 50,
+                    repetition_penalty: float = 1.2):
+     """Generate responses for video input using the selected model."""
+     if model_name == "Nanonets-OCR-s":
+         processor = processor_m
+         model = model_m
+     elif model_name == "MonkeyOCR-Recognition":
+         processor = processor_g
+         model = model_g
+     elif model_name == "SmolDocling-256M-preview":
+         processor = processor_x
+         model = model_x
+     elif model_name == "Typhoon-OCR-7B":
+         processor = processor_l
+         model = model_l
+     elif model_name == "Thyme-RL":
+         processor = processor_n
+         model = model_n
+     else:
+         yield "Invalid model selected.", "Invalid model selected."
          return

      if video_path is None:
+         yield "Please upload a video.", "Please upload a video."
          return
+
      frames = downsample_video(video_path)
+     images = [frame for frame, _ in frames]

+     if model_name == "SmolDocling-256M-preview":
+         if "OTSL" in text or "code" in text:
+             images = [add_random_padding(img) for img in images]
+         if "OCR at text at" in text or "Identify element" in text or "formula" in text:
+             text = normalize_values(text, target_max=500)

+     messages = [
+         {
+             "role": "user",
+             "content": [{"type": "image"} for _ in images] + [
+                 {"type": "text", "text": text}
+             ]
+         }
+     ]
+     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+
+     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     generation_kwargs = {
+         **inputs,
+         "streamer": streamer,
+         "max_new_tokens": max_new_tokens,
+         "temperature": temperature,
+         "top_p": top_p,
+         "top_k": top_k,
+         "repetition_penalty": repetition_penalty,
+     }
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text.replace("<|im_end|>", "")
+         yield buffer, buffer
+
+     if model_name == "SmolDocling-256M-preview":
+         cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
+         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
+             if "<chart>" in cleaned_output:
+                 cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
+             cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
+             doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
+             doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
+             markdown_output = doc.export_to_markdown()
+             yield buffer, markdown_output
+         else:
+             yield buffer, cleaned_output

+ # Define examples for image and video inference
  image_examples = [
      ["Reconstruct the doc [table] as it is.", "images/0.png"],
      ["Describe the image!", "images/8.png"],

      ["Convert chart to OTSL.", "images/4.png"],
      ["Convert code to text", "images/5.jpg"],
      ["Convert this table to OTSL.", "images/6.jpg"],
+     ["Convert formula to late.", "images/7.jpg"],
  ]

  video_examples = [
      ["Explain the video in detail.", "videos/1.mp4"],
+     ["Explain the video in detail.", "videos/2.mp4"]
  ]

+ #css
+ css = """
+ .submit-btn {
+     background-color: #2980b9 !important;
+     color: white !important;
+ }
+ .submit-btn:hover {
+     background-color: #3498db !important;
+ }
+ .canvas-output {
+     border: 2px solid #4682B4;
+     border-radius: 10px;
+     padding: 20px;
+ }
+ """
+
+ # Create the Gradio Interface
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
      gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
      with gr.Row():
+         with gr.Column():
              with gr.Tabs():
+                 with gr.TabItem("Image Inference"):
+                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                     image_upload = gr.Image(type="pil", label="Image", height=290)
+                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
+                     gr.Examples(
+                         examples=image_examples,
+                         inputs=[image_query, image_upload]
+                     )
+                 with gr.TabItem("Video Inference"):
+                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                     video_upload = gr.Video(label="Video", height=290)
+                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
+                     gr.Examples(
+                         examples=video_examples,
+                         inputs=[video_query, video_upload]
+                     )
+             with gr.Accordion("Advanced options", open=False):
+                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
+                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
+                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+
+         with gr.Column():
+             with gr.Column(elem_classes="canvas-output"):
                  gr.Markdown("## Output")
+                 raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5)
+
                  with gr.Accordion("(Result.md)", open=False):
+                     formatted_output = gr.Markdown(label="(Result.md)")
+
+             model_choice = gr.Radio(
+                 choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
+                 label="Select Model",
+                 value="Nanonets-OCR-s"
+             )
+
+             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)")
+             gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
+             gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
+             gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
+             gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
+             gr.Markdown("> [Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL): Thyme: Think Beyond Images. Thyme transcends traditional ``thinking with images'' paradigms by autonomously generating and executing diverse image processing and computational operations through executable code, significantly enhancing performance on high-resolution perception and complex reasoning tasks.")
+             gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")

      image_submit.click(
+         fn=generate_image,
+         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+         outputs=[raw_output, formatted_output]
      )
      video_submit.click(
+         fn=generate_video,
+         inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+         outputs=[raw_output,
+                  formatted_output]
      )

  if __name__ == "__main__":
+     demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)