prithivMLmods committed · verified
Commit 25a44d8 · Parent: eb21945

Update app.py

Files changed (1)
  1. app.py +101 -149
app.py CHANGED
@@ -23,7 +23,9 @@ from transformers import (
 )
 from transformers.image_utils import load_image
 
-from docling_core.types.doc import DoclingDocument, DocTagsDocument
+# These imports seem to be from a custom library.
+# If you have 'docling_core' installed, you can uncomment them.
+# from docling_core.types.doc import DoclingDocument, DocTagsDocument
 
 import re
 import ast
@@ -36,6 +38,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
+# --- Model Loading ---
 # Load Nanonets-OCR-s
 MODEL_ID_M = "nanonets/Nanonets-OCR-s"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
@@ -87,7 +90,8 @@ model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Preprocessing functions for SmolDocling-256M
+
+# --- Preprocessing and Helper Functions ---
 def add_random_padding(image, min_percent=0.1, max_percent=0.10):
     """Add random padding to an image based on its size."""
     image = image.convert("RGB")
@@ -121,6 +125,7 @@ def downsample_video(video_path):
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
+    # Use 10 frames for video processing
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
@@ -133,76 +138,11 @@ def downsample_video(video_path):
     vidcap.release()
     return frames
 
-@spaces.GPU
-def generate_image(model_name: str, text: str, image: Image.Image,
-                   max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
-                   top_k: int = 50,
-                   repetition_penalty: float = 1.2):
-    """Generate responses for image input using the selected model."""
-    if model_name == "Nanonets-OCR-s":
-        processor = processor_m
-        model = model_m
-    elif model_name == "MonkeyOCR-Recognition":
-        processor = processor_g
-        model = model_g
-    elif model_name == "SmolDocling-256M-preview":
-        processor = processor_x
-        model = model_x
-    elif model_name == "Typhoon-OCR-7B":
-        processor = processor_l
-        model = model_l
-    elif model_name == "Thyme-RL":
-        processor = processor_n
-        model = model_n
-    else:
-        yield "Invalid model selected.", "Invalid model selected."
-        return
-
-    if image is None:
-        yield "Please upload an image.", "Please upload an image."
-        return
-
-    images = [image]
-
-    if model_name == "SmolDocling-256M-preview":
-        if "OTSL" in text or "code" in text:
-            images = [add_random_padding(img) for img in images]
-        if "OCR at text at" in text or "Identify element" in text or "formula" in text:
-            text = normalize_values(text, target_max=500)
-
-    messages = [
-        {
-            "role": "user",
-            "content": [{"type": "image"} for _ in images] + [
-                {"type": "text", "text": text}
-            ]
-        }
-    ]
-    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-    }
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text.replace("<|im_end|>", "")
-        yield buffer, buffer
-
-    if model_name == "SmolDocling-256M-preview":
-        cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
+# A placeholder function in case docling_core is not installed
+def format_smoldocling_output(buffer_text, images):
+    cleaned_output = buffer_text.replace("<end_of_utterance>", "").strip()
+    # Check if docling_core is available and was imported
+    if 'DocTagsDocument' in globals() and 'DoclingDocument' in globals():
         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
             if "<chart>" in cleaned_output:
                 cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
@@ -210,43 +150,44 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
             doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
             markdown_output = doc.export_to_markdown()
-            yield buffer, markdown_output
-        else:
-            yield buffer, cleaned_output
+            return buffer_text, markdown_output
+    # Fallback if library is not available or tags are not present
+    return buffer_text, cleaned_output
 
-@spaces.GPU
-def generate_video(model_name: str, text: str, video_path: str,
-                   max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
-                   top_k: int = 50,
-                   repetition_penalty: float = 1.2):
-    """Generate responses for video input using the selected model."""
+# --- Core Generation Logic ---
+def get_model_and_processor(model_name):
+    """Helper to select model and processor."""
     if model_name == "Nanonets-OCR-s":
-        processor = processor_m
-        model = model_m
+        return processor_m, model_m
     elif model_name == "MonkeyOCR-Recognition":
-        processor = processor_g
-        model = model_g
+        return processor_g, model_g
     elif model_name == "SmolDocling-256M-preview":
-        processor = processor_x
-        model = model_x
+        return processor_x, model_x
     elif model_name == "Typhoon-OCR-7B":
-        processor = processor_l
-        model = model_l
+        return processor_l, model_l
     elif model_name == "Thyme-RL":
-        processor = processor_n
-        model = model_n
+        return processor_n, model_n
     else:
+        return None, None
+
+@spaces.GPU
+def generate_response(model_name: str, text: str, media_input, media_type: str,
+                      max_new_tokens: int, temperature: float, top_p: float, top_k: int, repetition_penalty: float):
+    """Unified generation function for both image and video."""
+    processor, model = get_model_and_processor(model_name)
+    if not processor or not model:
         yield "Invalid model selected.", "Invalid model selected."
         return
 
-    if video_path is None:
-        yield "Please upload a video.", "Please upload a video."
+    if media_input is None:
+        yield f"Please upload a {media_type}.", f"Please upload a {media_type}."
        return
 
-    frames = downsample_video(video_path)
-    images = [frame for frame, _ in frames]
+    if media_type == "video":
+        frames = downsample_video(media_input)
+        images = [frame for frame, _ in frames]
+    else:  # image
+        images = [media_input]
 
     if model_name == "SmolDocling-256M-preview":
         if "OTSL" in text or "code" in text:
@@ -255,12 +196,7 @@ def generate_video(model_name: str, text: str, video_path: str,
             text = normalize_values(text, target_max=500)
 
     messages = [
-        {
-            "role": "user",
-            "content": [{"type": "image"} for _ in images] + [
-                {"type": "text", "text": text}
-            ]
-        }
+        {"role": "user", "content": [{"type": "image"} for _ in images] + [{"type": "text", "text": text}]}
     ]
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
@@ -284,19 +220,20 @@ def generate_video(model_name: str, text: str, video_path: str,
         yield buffer, buffer
 
     if model_name == "SmolDocling-256M-preview":
-        cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
-        if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
-            if "<chart>" in cleaned_output:
-                cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
-            cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
-            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
-            doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
-            markdown_output = doc.export_to_markdown()
-            yield buffer, markdown_output
-        else:
-            yield buffer, cleaned_output
+        raw_output, formatted_output = format_smoldocling_output(buffer, images)
+        yield raw_output, formatted_output
+    else:
+        # For other models, the formatted output is just the cleaned buffer
+        yield buffer, buffer.strip()
+
+def generate_image_wrapper(*args):
+    yield from generate_response(*args, media_type="image")
 
-# Define examples for image and video inference
+def generate_video_wrapper(*args):
+    yield from generate_response(*args, media_type="video")
+
+
+# --- Examples ---
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
     ["Describe the image!", "images/8.png"],
@@ -306,7 +243,7 @@ image_examples = [
     ["Convert chart to OTSL.", "images/4.png"],
     ["Convert code to text", "images/5.jpg"],
     ["Convert this table to OTSL.", "images/6.jpg"],
-    ["Convert formula to late.", "images/7.jpg"],
+    ["Convert formula to latex.", "images/7.jpg"],
 ]
 
 video_examples = [
@@ -314,84 +251,99 @@ video_examples = [
     ["Explain the video in detail.", "videos/2.mp4"]
 ]
 
-#css
+# --- UI Styling ---
 css = """
 .submit-btn {
     background-color: #2980b9 !important;
     color: white !important;
+    border: none !important;
+    box-shadow: 2px 2px 5px rgba(0,0,0,0.2) !important;
 }
 .submit-btn:hover {
     background-color: #3498db !important;
+    box-shadow: 2px 2px 8px rgba(0,0,0,0.3) !important;
 }
 .canvas-output {
     border: 2px solid #4682B4;
     border-radius: 10px;
     padding: 20px;
+    background-color: #f0f8ff;
 }
 """
 
-# Create the Gradio Interface
-with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
+# --- Gradio Interface ---
+with gr.Blocks(css=css) as demo:
     gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+
     with gr.Row():
-        with gr.Column():
+        # Left Column for Inputs and Controls
+        with gr.Column(scale=1):
             with gr.Tabs():
-                with gr.TabItem("Image Inference"):
+                with gr.TabItem("🖼️ Image Inference"):
                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
-                    image_upload = gr.Image(type="pil", label="Image", height=290)
-                    image_submit = gr.Button("Submit", elem_classes="submit-btn")
+                    image_upload = gr.Image(type="pil", label="Upload Image", height=300)
                     gr.Examples(
                         examples=image_examples,
-                        inputs=[image_query, image_upload]
+                        inputs=[image_query, image_upload],
+                        label="Image Examples"
                     )
-                with gr.TabItem("Video Inference"):
+                    image_submit = gr.Button("Submit", elem_classes="submit-btn")
+
+                with gr.TabItem("🎥 Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
-                    video_upload = gr.Video(label="Video", height=290)
-                    video_submit = gr.Button("Submit", elem_classes="submit-btn")
                    video_upload = gr.Video(label="Upload Video", height=300)
                     gr.Examples(
                         examples=video_examples,
-                        inputs=[video_query, video_upload]
+                        inputs=[video_query, video_upload],
+                        label="Video Examples"
                     )
-            with gr.Accordion("Advanced options", open=False):
-                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+                    video_submit = gr.Button("Submit", elem_classes="submit-btn")
+
+            with gr.Accordion("⚙️ Advanced Options", open=False):
+                max_new_tokens = gr.Slider(label="Max New Tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
-                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+                repetition_penalty = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
 
-        with gr.Column():
+        # Right Column for Outputs and Model Info
+        with gr.Column(scale=1):
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
                 raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5)
 
-                with gr.Accordion("(Result.md)", open=False):
-                    formatted_output = gr.Markdown(label="(Result.md)")
+                with gr.Accordion("📄 Formatted Result (Result.md)", open=True):
+                    formatted_output = gr.Markdown(label="Formatted Output")
 
             model_choice = gr.Radio(
                 choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
-                label="Select Model",
+                label="🤖 Select Model",
                 value="Nanonets-OCR-s"
             )
 
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)")
-            gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
-            gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
-            gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
-            gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
-            gr.Markdown("> [Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL): Thyme: Think Beyond Images. Thyme transcends traditional ``thinking with images'' paradigms by autonomously generating and executing diverse image processing and computational operations through executable code, significantly enhancing performance on high-resolution perception and complex reasoning tasks.")
-            gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
+            gr.Markdown("> **[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)**: A powerful, state-of-the-art image-to-markdown OCR model that transforms documents into structured markdown with intelligent content recognition.")
+            gr.Markdown("> **[SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview)**: A multimodal Image-Text-to-Text model designed for efficient document conversion, retaining key features of the larger Docling model.")
+            gr.Markdown("> **[MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR)**: Adopts a Structure-Recognition-Relation (SRR) paradigm, simplifying the pipeline for document processing.")
+            gr.Markdown("> **[Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b)**: A bilingual document parsing model for real-world documents in Thai and English, capable of extracting text from images and charts.")
+            gr.Markdown("> **[Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL)**: Thyme transcends traditional 'thinking with images' by autonomously generating and executing code for image processing and computation, enhancing performance on complex reasoning tasks.")
+            gr.Markdown("> ⚠️ **Note**: All models in this space are primarily optimized for image tasks and may not perform as well on video inference use cases.")
 
+    # --- Event Handlers ---
+    common_inputs = [model_choice, max_new_tokens, temperature, top_p, top_k, repetition_penalty]
+    common_outputs = [raw_output, formatted_output]
+
     image_submit.click(
-        fn=generate_image,
-        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=[raw_output, formatted_output]
+        fn=generate_image_wrapper,
+        inputs=[image_query, image_upload] + common_inputs,
+        outputs=common_outputs
     )
+
     video_submit.click(
-        fn=generate_video,
-        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=[raw_output,
-            formatted_output]
+        fn=generate_video_wrapper,
+        inputs=[video_query, video_upload] + common_inputs,
+        outputs=common_outputs
     )
 
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
+    demo.queue(max_size=50).launch(share=True, show_error=True)