prithivMLmods committed on
Commit
87fc6ad
·
verified ·
1 Parent(s): 9e722ac

update app

Browse files
Files changed (1) hide show
  1. app.py +100 -57
app.py CHANGED
@@ -5,6 +5,7 @@ import json
5
  import time
6
  import asyncio
7
  from threading import Thread
 
8
 
9
  import gradio as gr
10
  import spaces
@@ -12,13 +13,83 @@ import torch
12
  import numpy as np
13
  from PIL import Image
14
  import cv2
 
15
 
16
  from transformers import (
 
17
  Qwen2_5_VLForConditionalGeneration,
18
  AutoProcessor,
19
  TextIteratorStreamer,
 
 
20
  )
21
  from transformers.image_utils import load_image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  # Constants for text generation
24
  MAX_MAX_NEW_TOKENS = 2048
@@ -72,7 +143,7 @@ def downsample_video(video_path):
72
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
73
  fps = vidcap.get(cv2.CAP_PROP_FPS)
74
  frames = []
75
- frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
76
  for i in frame_indices:
77
  vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
78
  success, image = vidcap.read()
@@ -96,17 +167,13 @@ def generate_image(model_name: str, text: str, image: Image.Image,
96
  Yields raw text and Markdown-formatted text.
97
  """
98
  if model_name == "Cosmos-Reason1-7B":
99
- processor = processor_m
100
- model = model_m
101
  elif model_name == "docscopeOCR-7B-050425-exp":
102
- processor = processor_x
103
- model = model_x
104
  elif model_name == "Captioner-7B-Qwen2.5VL":
105
- processor = processor_z
106
- model = model_z
107
  elif model_name == "visionOCR-3B":
108
- processor = processor_v
109
- model = model_v
110
  else:
111
  yield "Invalid model selected.", "Invalid model selected."
112
  return
@@ -118,7 +185,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
118
  messages = [{
119
  "role": "user",
120
  "content": [
121
- {"type": "image", "image": image},
122
  {"type": "text", "text": text},
123
  ]
124
  }]
@@ -128,7 +195,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
128
  images=[image],
129
  return_tensors="pt",
130
  padding=True,
131
- truncation=False,
132
  max_length=MAX_INPUT_TOKEN_LENGTH
133
  ).to(device)
134
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
@@ -153,17 +220,13 @@ def generate_video(model_name: str, text: str, video_path: str,
153
  Yields raw text and Markdown-formatted text.
154
  """
155
  if model_name == "Cosmos-Reason1-7B":
156
- processor = processor_m
157
- model = model_m
158
  elif model_name == "docscopeOCR-7B-050425-exp":
159
- processor = processor_x
160
- model = model_x
161
  elif model_name == "Captioner-7B-Qwen2.5VL":
162
- processor = processor_z
163
- model = model_z
164
  elif model_name == "visionOCR-3B":
165
- processor = processor_v
166
- model = model_v
167
  else:
168
  yield "Invalid model selected.", "Invalid model selected."
169
  return
@@ -187,7 +250,7 @@ def generate_video(model_name: str, text: str, video_path: str,
187
  add_generation_prompt=True,
188
  return_dict=True,
189
  return_tensors="pt",
190
- truncation=False,
191
  max_length=MAX_INPUT_TOKEN_LENGTH
192
  ).to(device)
193
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
@@ -221,42 +284,30 @@ video_examples = [
221
  ]
222
 
223
  css = """
224
- .submit-btn {
225
- background-color: #2980b9 !important;
226
- color: white !important;
227
- }
228
- .submit-btn:hover {
229
- background-color: #3498db !important;
230
  }
231
- .canvas-output {
232
- border: 2px solid #4682B4;
233
- border-radius: 10px;
234
- padding: 20px;
235
  }
236
  """
237
 
238
  # Create the Gradio Interface
239
- with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
240
- gr.Markdown("# **[DocScope R1](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
241
  with gr.Row():
242
- with gr.Column():
243
  with gr.Tabs():
244
  with gr.TabItem("Image Inference"):
245
  image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
246
  image_upload = gr.Image(type="pil", label="Image", height=290)
247
- image_submit = gr.Button("Submit", elem_classes="submit-btn")
248
- gr.Examples(
249
- examples=image_examples,
250
- inputs=[image_query, image_upload]
251
- )
252
  with gr.TabItem("Video Inference"):
253
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
254
  video_upload = gr.Video(label="Video", height=290)
255
- video_submit = gr.Button("Submit", elem_classes="submit-btn")
256
- gr.Examples(
257
- examples=video_examples,
258
- inputs=[video_query, video_upload]
259
- )
260
 
261
  with gr.Accordion("Advanced options", open=False):
262
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
@@ -265,25 +316,17 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
265
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
266
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
267
 
268
- with gr.Column():
269
- with gr.Column(elem_classes="canvas-output"):
270
- gr.Markdown("## Output")
271
- raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5, show_copy_button=True)
272
-
273
- with gr.Accordion("(Result.md)", open=False):
274
- markdown_output = gr.Markdown()
275
 
276
  model_choice = gr.Radio(
277
  choices=["Cosmos-Reason1-7B", "docscopeOCR-7B-050425-exp", "Captioner-7B-Qwen2.5VL", "visionOCR-3B"],
278
  label="Select Model",
279
  value="Cosmos-Reason1-7B"
280
  )
281
- gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/DocScope-R1/discussions)")
282
- gr.Markdown("> [Cosmos-Reason1-7B](https://huggingface.co/nvidia/Cosmos-Reason1-7B): understand physical common sense and generate appropriate embodied decisions.")
283
- gr.Markdown("> [docscopeOCR-7B-050425-exp](https://huggingface.co/prithivMLmods/docscopeOCR-7B-050425-exp): optimized for document-level optical character recognition, long-context vision-language understanding.")
284
- gr.Markdown("> [Captioner-Relaxed-7B](https://huggingface.co/Ertugrul/Qwen2.5-VL-7B-Captioner-Relaxed): build with hand-curated dataset for text-to-image models, providing significantly more detailed descriptions or captions of given images.")
285
- gr.Markdown("> [visionOCR-3B](https://huggingface.co/prithivMLmods/visionOCR-3B-061125): visionocr-3b-061125 model is a fine-tuned version of qwen2.5-vl-3b-instruct, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
286
- gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
287
 
288
  image_submit.click(
289
  fn=generate_image,
@@ -297,4 +340,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
297
  )
298
 
299
  if __name__ == "__main__":
300
- demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
 
5
  import time
6
  import asyncio
7
  from threading import Thread
8
+ from typing import Iterable
9
 
10
  import gradio as gr
11
  import spaces
 
13
  import numpy as np
14
  from PIL import Image
15
  import cv2
16
+ import requests
17
 
18
  from transformers import (
19
+ Qwen2VLForConditionalGeneration,
20
  Qwen2_5_VLForConditionalGeneration,
21
  AutoProcessor,
22
  TextIteratorStreamer,
23
+ AutoModel,
24
+ AutoTokenizer,
25
  )
26
  from transformers.image_utils import load_image
27
+ from gradio.themes import Soft
28
+ from gradio.themes.utils import colors, fonts, sizes
29
+
30
+ # --- Theme and CSS Definition ---
31
+
32
# Custom steel-blue palette, attached to gradio's ``colors`` module so the
# theme class can reference it as ``colors.steel_blue`` alongside the
# built-in hues.
colors.steel_blue = colors.Color(
    **{
        "name": "steel_blue",
        "c50": "#EBF3F8",
        "c100": "#D3E5F0",
        "c200": "#A8CCE1",
        "c300": "#7DB3D2",
        "c400": "#529AC3",
        "c500": "#4682B4",  # SteelBlue base color
        "c600": "#3E72A0",
        "c700": "#36638C",
        "c800": "#2E5378",
        "c900": "#264364",
        "c950": "#1E3450",
    }
)
46
+
47
class SteelBlueTheme(Soft):
    """Gradio ``Soft`` theme variant with a steel-blue secondary hue.

    All constructor arguments are keyword-only and are forwarded unchanged to
    ``Soft.__init__``; a fixed set of style overrides (backgrounds, primary
    button gradients, slider colour, block borders/shadows) is then applied
    through ``set``.
    """

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.gray,
        secondary_hue: colors.Color | str = colors.steel_blue,
        neutral_hue: colors.Color | str = colors.slate,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
        ),
    ):
        # Options handed straight through to the parent theme constructor.
        base_options = {
            "primary_hue": primary_hue,
            "secondary_hue": secondary_hue,
            "neutral_hue": neutral_hue,
            "text_size": text_size,
            "font": font,
            "font_mono": font_mono,
        }
        super().__init__(**base_options)

        # Style variables overridden on top of the parent theme. Values that
        # start with "*" refer to other theme variables (e.g. palette shades).
        style_overrides = {
            # Page and block backgrounds (light / dark).
            "background_fill_primary": "*primary_50",
            "background_fill_primary_dark": "*primary_900",
            "body_background_fill": "linear-gradient(135deg, *primary_200, *primary_100)",
            "body_background_fill_dark": "linear-gradient(135deg, *primary_900, *primary_800)",
            # Primary button text and gradient fills.
            "button_primary_text_color": "white",
            "button_primary_text_color_hover": "white",
            "button_primary_background_fill": "linear-gradient(90deg, *secondary_500, *secondary_600)",
            "button_primary_background_fill_hover": "linear-gradient(90deg, *secondary_600, *secondary_700)",
            "button_primary_background_fill_dark": "linear-gradient(90deg, *secondary_600, *secondary_800)",
            "button_primary_background_fill_hover_dark": "linear-gradient(90deg, *secondary_500, *secondary_500)",
            # Sliders.
            "slider_color": "*secondary_500",
            "slider_color_dark": "*secondary_600",
            # Block chrome: titles, borders, shadows, padding, accents.
            "block_title_text_weight": "600",
            "block_border_width": "3px",
            "block_shadow": "*shadow_drop_lg",
            "button_primary_shadow": "*shadow_drop_lg",
            "button_large_padding": "11px",
            "color_accent_soft": "*primary_100",
            "block_label_background_fill": "*primary_200",
        }
        super().set(**style_overrides)
91
+
92
+ steel_blue_theme = SteelBlueTheme()
93
 
94
  # Constants for text generation
95
  MAX_MAX_NEW_TOKENS = 2048
 
143
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
144
  fps = vidcap.get(cv2.CAP_PROP_FPS)
145
  frames = []
146
+ frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
147
  for i in frame_indices:
148
  vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
149
  success, image = vidcap.read()
 
167
  Yields raw text and Markdown-formatted text.
168
  """
169
  if model_name == "Cosmos-Reason1-7B":
170
+ processor, model = processor_m, model_m
 
171
  elif model_name == "docscopeOCR-7B-050425-exp":
172
+ processor, model = processor_x, model_x
 
173
  elif model_name == "Captioner-7B-Qwen2.5VL":
174
+ processor, model = processor_z, model_z
 
175
  elif model_name == "visionOCR-3B":
176
+ processor, model = processor_v, model_v
 
177
  else:
178
  yield "Invalid model selected.", "Invalid model selected."
179
  return
 
185
  messages = [{
186
  "role": "user",
187
  "content": [
188
+ {"type": "image"},
189
  {"type": "text", "text": text},
190
  ]
191
  }]
 
195
  images=[image],
196
  return_tensors="pt",
197
  padding=True,
198
+ truncation=True,
199
  max_length=MAX_INPUT_TOKEN_LENGTH
200
  ).to(device)
201
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
 
220
  Yields raw text and Markdown-formatted text.
221
  """
222
  if model_name == "Cosmos-Reason1-7B":
223
+ processor, model = processor_m, model_m
 
224
  elif model_name == "docscopeOCR-7B-050425-exp":
225
+ processor, model = processor_x, model_x
 
226
  elif model_name == "Captioner-7B-Qwen2.5VL":
227
+ processor, model = processor_z, model_z
 
228
  elif model_name == "visionOCR-3B":
229
+ processor, model = processor_v, model_v
 
230
  else:
231
  yield "Invalid model selected.", "Invalid model selected."
232
  return
 
250
  add_generation_prompt=True,
251
  return_dict=True,
252
  return_tensors="pt",
253
+ truncation=True,
254
  max_length=MAX_INPUT_TOKEN_LENGTH
255
  ).to(device)
256
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
 
284
  ]
285
 
286
  css = """
287
+ #main-title h1 {
288
+ font-size: 2.3em !important;
 
 
 
 
289
  }
290
+ #output-title h2 {
291
+ font-size: 2.1em !important;
 
 
292
  }
293
  """
294
 
295
  # Create the Gradio Interface
296
+ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
297
+ gr.Markdown("# **DocScope R1**", elem_id="main-title")
298
  with gr.Row():
299
+ with gr.Column(scale=2):
300
  with gr.Tabs():
301
  with gr.TabItem("Image Inference"):
302
  image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
303
  image_upload = gr.Image(type="pil", label="Image", height=290)
304
+ image_submit = gr.Button("Submit", variant="primary")
305
+ gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
 
 
 
306
  with gr.TabItem("Video Inference"):
307
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
308
  video_upload = gr.Video(label="Video", height=290)
309
+ video_submit = gr.Button("Submit", variant="primary")
310
+ gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
 
 
 
311
 
312
  with gr.Accordion("Advanced options", open=False):
313
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
 
316
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
317
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
318
 
319
+ with gr.Column(scale=3):
320
+ gr.Markdown("## Output", elem_id="output-title")
321
+ raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=14, show_copy_button=True)
322
+ with gr.Accordion("(Result.md)", open=False):
323
+ markdown_output = gr.Markdown()
 
 
324
 
325
  model_choice = gr.Radio(
326
  choices=["Cosmos-Reason1-7B", "docscopeOCR-7B-050425-exp", "Captioner-7B-Qwen2.5VL", "visionOCR-3B"],
327
  label="Select Model",
328
  value="Cosmos-Reason1-7B"
329
  )
 
 
 
 
 
 
330
 
331
  image_submit.click(
332
  fn=generate_image,
 
340
  )
341
 
342
  if __name__ == "__main__":
343
+ demo.queue(max_size=30).launch(mcp_server=True, ssr_mode=False, show_error=True)