prithivMLmods committed
Commit cfdf27b · verified · 1 Parent(s): 6247522

Update app.py

Files changed (1)
  1. app.py +100 -80
app.py CHANGED
@@ -1,12 +1,12 @@
 import os
-import re
 import time
 from threading import Thread
+import re
+from PIL import Image, ImageDraw
 
 import gradio as gr
 import spaces
 import torch
-from PIL import Image, ImageDraw
 
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
@@ -30,62 +30,76 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-
-def extract_coordinates(text: str):
-    """Extract all (x, y) coordinates from model output text."""
-    pattern = r"\((\d+),\s*(\d+)\)"
-    coords = re.findall(pattern, text)
-    return [(int(x), int(y)) for x, y in coords]
-
-
-def draw_boxes(image: Image.Image, coords, box_type="solid"):
-    """Draw bounding boxes on the image."""
-    img_copy = image.copy()
-    draw = ImageDraw.Draw(img_copy)
-
-    side = 50  # square size
-    for (x, y) in coords:
-        box = [(x - side//2, y - side//2), (x + side//2, y + side//2)]
-        if box_type == "solid":
-            draw.rectangle(box, outline="red", width=3)
-        elif box_type == "dotted":
-            # Draw dotted (dashed) rectangle
-            dash_len = 5
-            x1, y1 = box[0]
-            x2, y2 = box[1]
-
-            # Top edge
-            for i in range(x1, x2, dash_len*2):
-                draw.line([(i, y1), (min(i+dash_len, x2), y1)], fill="blue", width=2)
-            # Bottom edge
-            for i in range(x1, x2, dash_len*2):
-                draw.line([(i, y2), (min(i+dash_len, x2), y2)], fill="blue", width=2)
-            # Left edge
-            for i in range(y1, y2, dash_len*2):
-                draw.line([(x1, i), (x1, min(i+dash_len, y2))], fill="blue", width=2)
-            # Right edge
-            for i in range(y1, y2, dash_len*2):
-                draw.line([(x2, i), (x2, min(i+dash_len, y2))], fill="blue", width=2)
-
-    return img_copy
-
+def parse_model_output(text: str):
+    """
+    Parses the model output to extract the answer and bounding box coordinates.
+    """
+    # Extract coordinates from the <think> block
+    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
+    coordinates = []
+    if think_match:
+        think_content = think_match.group(1)
+        # Find all occurrences of (x, y) coordinates
+        coords_raw = re.findall(r'\((\d+),\s*(\d+)\)', think_content)
+        coordinates = [(int(x), int(y)) for x, y in coords_raw]
+
+    # Extract the answer from the <answer> block
+    answer_match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
+    answer = answer_match.group(1).strip() if answer_match else text
+
+    return answer, coordinates
+
+def draw_bounding_boxes(image: Image.Image, coordinates: list, box_size: int = 60, use_dotted_style: bool = False):
+    """
+    Draws square bounding boxes on the image at the given coordinates.
+    """
+    if not coordinates:
+        return image
+
+    img_with_boxes = image.copy()
+    draw = ImageDraw.Draw(img_with_boxes, "RGBA")
+
+    half_box = box_size // 2
+
+    for (x, y) in coordinates:
+        # Define the bounding box corners
+        x1 = x - half_box
+        y1 = y - half_box
+        x2 = x + half_box
+        y2 = y + half_box
+
+        if use_dotted_style:
+            # "Dotted like seaborn" - a semi-transparent fill with a solid outline
+            fill_color = (0, 100, 255, 60)  # Light blue, semi-transparent
+            outline_color = (0, 0, 255)  # Solid blue
+            draw.rectangle([x1, y1, x2, y2], fill=fill_color, outline=outline_color, width=2)
+        else:
+            # Default solid box
+            outline_color = (255, 0, 0)  # Red
+            draw.rectangle([x1, y1, x2, y2], outline=outline_color, width=3)
+
+    return img_with_boxes
 
 @spaces.GPU
 def generate_image(text: str, image: Image.Image,
-                   max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
-                   top_k: int = 50,
-                   repetition_penalty: float = 1.2,
-                   box_type: str = "solid"):
+                   max_new_tokens: int,
+                   temperature: float,
+                   top_p: float,
+                   top_k: int,
+                   repetition_penalty: float,
+                   draw_boxes: bool,
+                   use_dotted_style: bool):
     """
-    Generates responses using the Lumian2-VLR-7B-Thinking model for image input.
-    Yields raw text, Markdown-formatted text, and annotated image with bounding boxes.
+    Generates responses and draws bounding boxes based on model output.
+    Yields raw text, markdown-formatted text, and the processed image.
     """
     if image is None:
         yield "Please upload an image.", "Please upload an image.", None
         return
 
+    # Yield the original image immediately for the output display
+    yield "", "", image
+
     messages = [{
         "role": "user",
         "content": [
@@ -115,15 +129,23 @@ def generate_image(text: str, image: Image.Image,
     }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
+
    buffer = ""
-    coords = []
    for new_text in streamer:
        buffer += new_text
-        coords = extract_coordinates(buffer)
-        annotated_image = draw_boxes(image, coords, box_type) if coords else None
        time.sleep(0.01)
-        yield buffer, buffer, annotated_image
-
+        # During generation, yield text updates but keep the original image
+        yield buffer, buffer, image
+
+    # After generation is complete, parse the output and draw boxes
+    final_answer, coordinates = parse_model_output(buffer)
+
+    output_image = image
+    if draw_boxes and coordinates:
+        output_image = draw_bounding_boxes(image, coordinates, use_dotted_style=use_dotted_style)
+
+    # Yield the final result with the processed image
+    yield buffer, final_answer, output_image
 
 # Define examples for image inference
 image_examples = [
@@ -154,52 +176,50 @@ css = """
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("# **Lumian2-VLR-7B-Thinking Image Inference**")
     with gr.Row():
-        with gr.Column():
+        with gr.Column(scale=1):
             gr.Markdown("## Image Inference")
             image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
             image_upload = gr.Image(type="pil", label="Image")
             image_submit = gr.Button("Submit", elem_classes="submit-btn")
-            gr.Examples(
-                examples=image_examples,
-                inputs=[image_query, image_upload]
-            )
+
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-
-                # New options for bounding box visualization
-                box_type = gr.Radio(
-                    choices=["solid", "dotted"],
-                    value="solid",
-                    label="Bounding Box Style"
+
+            gr.Examples(
+                examples=image_examples,
+                inputs=[image_query, image_upload]
             )
 
-        with gr.Column():
-            with gr.Column(elem_classes="canvas-output"):
-                gr.Markdown("## Output")
-                output = gr.Textbox(label="Raw Output", interactive=False, lines=3, scale=2)
-
-                with gr.Accordion("(Result.md)", open=False):
-                    markdown_output = gr.Markdown()
-
-                annotated_output = gr.Image(label="Annotated Image with Bounding Boxes")
+        with gr.Column(scale=2):
+            gr.Markdown("## Output")
+            with gr.Tabs():
+                with gr.TabItem("Image with Bounding Box"):
+                    image_output = gr.Image(label="Processed Image")
+                with gr.TabItem("Raw Text"):
+                    output = gr.Textbox(label="Raw Model Output", interactive=False, lines=10)
+                with gr.TabItem("Parsed Answer"):
+                    markdown_output = gr.Markdown(label="Parsed Answer")
 
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Qwen2.5-VL/discussions)")
 
             gr.Markdown(
-                """
-                > [Lumian2-VLR-7B-Thinking](https://huggingface.co/prithivMLmods/Lumian2-VLR-7B-Thinking): The Lumian2-VLR-7B-Thinking model is a high-fidelity vision-language reasoning (experimental model) system designed for fine-grained multimodal understanding. Built on Qwen2.5-VL-7B-Instruct, this model enhances image captioning, and document comprehension through explicit grounded reasoning. It produces structured reasoning traces aligned with visual coordinates, enabling explainable multimodal reasoning.
-                """
+                """> [Lumian2-VLR-7B-Thinking](https://huggingface.co/prithivMLmods/Lumian2-VLR-7B-Thinking): The Lumian2-VLR-7B-Thinking model is a high-fidelity vision-language reasoning (experimental model) system designed for fine-grained multimodal understanding. Built on Qwen2.5-VL-7B-Instruct, this model enhances image captioning, and document comprehension through explicit grounded reasoning. It produces structured reasoning traces aligned with visual coordinates, enabling explainable multimodal reasoning."""
             )
 
+    with gr.Row():
+        draw_boxes_checkbox = gr.Checkbox(label="Draw Bounding Boxes", value=True)
+        dotted_style_checkbox = gr.Checkbox(label="Use Dotted Style for Boxes", value=False)
+
     image_submit.click(
         fn=generate_image,
-        inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty, box_type],
-        outputs=[output, markdown_output, annotated_output]
+        inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty, draw_boxes_checkbox, dotted_style_checkbox],
+        outputs=[output, markdown_output, image_output]
     )
 
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(share=True)
+    demo.queue(max_size=50).launch(share=True)
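
For reviewers who want to sanity-check the new post-processing path outside the Space, here is a minimal, self-contained sketch. The two helpers are condensed copies of the `parse_model_output` and `draw_bounding_boxes` functions added in this commit; the sample `<think>`/`<answer>` string, the blank canvas, and the `preview.png` filename are hypothetical, for illustration only.

```python
import re
from PIL import Image, ImageDraw

def parse_model_output(text: str):
    # Condensed copy of the helper added in this commit: coordinates are
    # read from the <think> block, the final answer from the <answer> block.
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    coordinates = []
    if think_match:
        coords_raw = re.findall(r"\((\d+),\s*(\d+)\)", think_match.group(1))
        coordinates = [(int(x), int(y)) for x, y in coords_raw]
    answer_match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
    answer = answer_match.group(1).strip() if answer_match else text
    return answer, coordinates

def draw_bounding_boxes(image, coordinates, box_size=60, use_dotted_style=False):
    # Condensed copy of the drawing helper: fixed-size squares centered on
    # each (x, y) point, either solid red or semi-transparent blue.
    img = image.copy()
    draw = ImageDraw.Draw(img, "RGBA")
    half = box_size // 2
    for (x, y) in coordinates:
        box = [x - half, y - half, x + half, y + half]
        if use_dotted_style:
            draw.rectangle(box, fill=(0, 100, 255, 60), outline=(0, 0, 255), width=2)
        else:
            draw.rectangle(box, outline=(255, 0, 0), width=3)
    return img

# Hypothetical model output, shaped like the grounded reasoning traces the
# model card describes.
sample = ("<think>The mug sits at (120, 80); the plate is near (300, 240).</think>"
          "<answer>Two objects located.</answer>")
answer, coords = parse_model_output(sample)
print(answer)  # -> Two objects located.
print(coords)  # -> [(120, 80), (300, 240)]

# Smoke test of the drawing path on a blank canvas.
canvas = Image.new("RGB", (400, 300), "white")
draw_bounding_boxes(canvas, coords, use_dotted_style=True).save("preview.png")
```

Note that, unlike the previous version, the UI now draws boxes only once generation finishes: coordinates are parsed from the completed `<think>` block, so intermediate yields keep the original image.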