prithivMLmods committed · verified
Commit d69431d · 1 Parent(s): dfae74e

Upload app.py

Files changed (1)
app.py  +301 -0
app.py ADDED
@@ -0,0 +1,301 @@
import os
import time
import threading
import gradio as gr
import spaces
import torch
import numpy as np
from PIL import Image
import cv2
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    Qwen2VLForConditionalGeneration,
    Glm4vForConditionalGeneration,
    AutoProcessor,
    TextIteratorStreamer,
)
from qwen_vl_utils import process_vision_info

# Constants for text generation
MAX_MAX_NEW_TOKENS = 16384
DEFAULT_MAX_NEW_TOKENS = 8192
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
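
# MAX_INPUT_TOKEN_LENGTH is read from the environment above, so the cap can be raised
# without editing this file, e.g. (illustrative launch command, not required):
#   MAX_INPUT_TOKEN_LENGTH=8192 python app.py
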
# Load Camel-Doc-OCR-062825
MODEL_ID_M = "prithivMLmods/Camel-Doc-OCR-062825"
processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_M,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()

# Load Megalodon-OCR-Sync-0713
MODEL_ID_T = "prithivMLmods/Megalodon-OCR-Sync-0713"
processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
model_t = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_T,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()

# Load Video-MTR
MODEL_ID_S = "Phoebe13/Video-MTR"
processor_s = AutoProcessor.from_pretrained(MODEL_ID_S, trust_remote_code=True)
model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_S,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()

# Load ViLaSR
MODEL_ID_Y = "inclusionAI/ViLaSR"
processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
model_y = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_Y,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()
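
# All four checkpoints are Qwen2.5-VL-style models loaded with the same class, dtype, and
# device. A possible (unused here) alternative to the if/elif chains in the generate_*
# functions below would be a lookup table keyed by the UI model name:
#
#   MODELS = {
#       "Camel-Doc-OCR-062825": (processor_m, model_m),
#       "Megalodon-OCR-Sync-0713": (processor_t, model_t),
#       "Video-MTR": (processor_s, model_s),
#       "ViLaSR-7B": (processor_y, model_y),
#   }
#   processor, model = MODELS[model_name]
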
def downsample_video(video_path):
    """
    Downsample a video to 10 evenly spaced frames, returning each as a PIL image
    together with its timestamp in seconds.
    """
    vidcap = cv2.VideoCapture(video_path)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    frames = []
    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
    for i in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
        success, image = vidcap.read()
        if success:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(image)
            timestamp = round(i / fps, 2)
            frames.append((pil_image, timestamp))
    vidcap.release()
    return frames
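
# Example: downsample_video("videos/a.mp4") returns up to 10 (PIL.Image, timestamp) pairs
# sampled evenly across the clip; fewer if some frame reads fail.
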
@spaces.GPU(duration=120)
def generate_image(model_name: str, text: str, image: Image.Image,
                   max_new_tokens: int = 1024,
                   temperature: float = 0.6,
                   top_p: float = 0.9,
                   top_k: int = 50,
                   repetition_penalty: float = 1.2):
    """
    Generate responses using the selected model for image input.
    """
    if model_name == "Camel-Doc-OCR-062825":
        processor = processor_m
        model = model_m
    elif model_name == "Megalodon-OCR-Sync-0713":
        processor = processor_t
        model = model_t
    elif model_name == "Video-MTR":
        processor = processor_s
        model = model_s
    elif model_name == "ViLaSR-7B":
        processor = processor_y
        model = model_y
    else:
        yield "Invalid model selected.", "Invalid model selected."
        return

    if image is None:
        yield "Please upload an image.", "Please upload an image."
        return

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": text},
        ]
    }]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
        truncation=False,
        max_length=MAX_INPUT_TOKEN_LENGTH
    ).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    # Forward the UI sampling controls to model.generate, matching generate_video below.
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
    }
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer, buffer
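
# Both generate_image (above) and generate_video (below) stream output the same way:
# model.generate runs in a background thread while TextIteratorStreamer yields decoded
# text chunks, so the Gradio textbox and Markdown panes update incrementally.
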
@spaces.GPU
def generate_video(model_name: str, text: str, video_path: str,
                   max_new_tokens: int = 1024,
                   temperature: float = 0.6,
                   top_p: float = 0.9,
                   top_k: int = 50,
                   repetition_penalty: float = 1.2):
    """
    Generate responses using the selected model for video input.
    """
    if model_name == "Camel-Doc-OCR-062825":
        processor = processor_m
        model = model_m
    elif model_name == "Megalodon-OCR-Sync-0713":
        processor = processor_t
        model = model_t
    elif model_name == "Video-MTR":
        processor = processor_s
        model = model_s
    elif model_name == "ViLaSR-7B":
        processor = processor_y
        model = model_y
    else:
        yield "Invalid model selected.", "Invalid model selected."
        return

    if video_path is None:
        yield "Please upload a video.", "Please upload a video."
        return

    frames = downsample_video(video_path)
    messages = [
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
        {"role": "user", "content": [{"type": "text", "text": text}]}
    ]
    for frame in frames:
        image, timestamp = frame
        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
        messages[1]["content"].append({"type": "image", "image": image})
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
        truncation=False,
        max_length=MAX_INPUT_TOKEN_LENGTH
    ).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
    }
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        buffer = buffer.replace("<|im_end|>", "")
        time.sleep(0.01)
        yield buffer, buffer
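
# For video prompts, the user turn interleaves a "Frame <timestamp>:" text entry before each
# sampled frame, so the assembled content list looks roughly like:
#   [{"type": "text", "text": <query>},
#    {"type": "text", "text": "Frame 0.0:"}, {"type": "image", "image": <PIL.Image>}, ...]
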
# Define examples for image and video inference
image_examples = [
    ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
    ["explain the movie shot in detail.", "images/5.jpg"],
    ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
    ["explain the movie shot in detail.", "images/3.png"],
    ["fill the correct numbers.", "images/4.png"]
]

video_examples = [
    ["explain the video in detail.", "videos/b.mp4"],
    ["explain the ad video in detail.", "videos/a.mp4"]
]
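
# The example prompts reference images/*.png|jpg and videos/*.mp4 assets that are assumed
# to ship alongside app.py in the Space repository.
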
# CSS for the submit button and the output canvas
css = """
.submit-btn {
    background-color: #2980b9 !important;
    color: white !important;
}
.submit-btn:hover {
    background-color: #3498db !important;
}
.canvas-output {
    border: 2px solid #4682B4;
    border-radius: 10px;
    padding: 20px;
}
"""

# Create the Gradio Interface
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
    gr.Markdown("# **[Multimodal VLM v1.0](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
    with gr.Row():
        with gr.Column():
            with gr.Tabs():
                with gr.TabItem("Image Inference"):
                    image_query = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query here...")
                    image_upload = gr.Image(type="pil", label="Image")
                    image_submit = gr.Button("Submit", elem_classes="submit-btn")
                    gr.Examples(
                        examples=image_examples,
                        inputs=[image_query, image_upload]
                    )
                with gr.TabItem("Video Inference"):
                    video_query = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query here...")
                    video_upload = gr.Video(label="Video")
                    video_submit = gr.Button("Submit", elem_classes="submit-btn")
                    gr.Examples(
                        examples=video_examples,
                        inputs=[video_query, video_upload]
                    )
            with gr.Accordion("Advanced options", open=False):
                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
        with gr.Column():
            with gr.Column(elem_classes="canvas-output"):
                gr.Markdown("## Output")
                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=3)
                with gr.Accordion("(Result.md)", open=False):
                    markdown_output = gr.Markdown(label="(Result.md)")
            model_choice = gr.Radio(
                choices=["Camel-Doc-OCR-062825", "Video-MTR", "Megalodon-OCR-Sync-0713", "ViLaSR-7B"],
                label="Select Model",
                value="Camel-Doc-OCR-062825"
            )
            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-v1.0/discussions)")

            gr.Markdown("> [Camel-Doc-OCR-062825](https://huggingface.co/prithivMLmods/Camel-Doc-OCR-062825) is a Qwen2.5-VL-7B-Instruct finetune, highly optimized for document retrieval, structured extraction, analysis, and direct Markdown generation from images and PDFs.")
            gr.Markdown("> [Megalodon-OCR-Sync-0713](https://huggingface.co/prithivMLmods/Megalodon-OCR-Sync-0713), finetuned from Qwen2.5-VL-3B-Instruct, specializes in context-aware multimodal document extraction and analysis, excelling at retrieval, layout parsing, math, and chart/table recognition.")
            gr.Markdown("> [ViLaSR-7B](https://huggingface.co/inclusionAI/ViLaSR) focuses on reinforcing spatial reasoning in visual-language tasks by combining interwoven thinking with visual drawing, making it especially suited for spatial reasoning and complex tip-based queries.")
            gr.Markdown("> [Video-MTR](https://huggingface.co/Phoebe13/Video-MTR) introduces reinforced multi-turn reasoning for long-form video understanding, enabling iterative key segment selection and deeper question comprehension.")

            gr.Markdown("> ✋ In this demo, ViLaSR-7B runs with text-only reasoning, which doesn't reflect the full behavior of the model and may underrepresent its capabilities.")
            gr.Markdown("> ⚠️ Note: Models in this space may not perform well on video inference tasks.")
    # Define the submit button actions
    image_submit.click(fn=generate_image,
                       inputs=[
                           model_choice, image_query, image_upload,
                           max_new_tokens, temperature, top_p, top_k,
                           repetition_penalty
                       ],
                       outputs=[output, markdown_output])
    video_submit.click(fn=generate_video,
                       inputs=[
                           model_choice, video_query, video_upload,
                           max_new_tokens, temperature, top_p, top_k,
                           repetition_penalty
                       ],
                       outputs=[output, markdown_output])

if __name__ == "__main__":
    demo.queue(max_size=40).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)