akhaliq (HF Staff) committed
Commit 8e814c4 · verified · 1 parent: 837d8aa

Update app.py

Files changed (1)
app.py +120 -234
app.py CHANGED
@@ -1,306 +1,192 @@
-import spaces
-import gradio as gr
-import torch
-from PIL import Image
-from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor
 import os
-import tempfile
 
-# Import required modules from perceptron
-from perceptron.tensorstream.ops import tensor_stream_token_view, modality_mask
-from perceptron.pointing.parser import extract_points
 
-# Define vision type enum
-class VisionType:
-    image = 1
 
 def document_to_messages(document, vision_token="<image>"):
-    """Convert a Document to messages format compatible with chat templates."""
     messages = []
     images = []
-
     for item in document:
         itype = item.get("type")
         if itype == "text":
             content = item.get("content")
             if content:
-                messages.append({
-                    "role": item.get("role", "user"),
-                    "content": content,
-                })
         elif itype == "image":
-            content = item.get("content")
-            if content:
-                if isinstance(content, str) and os.path.exists(content):
-                    img = Image.open(content)
-                elif hasattr(content, 'read'):  # Gradio file object
-                    img = Image.open(content)
-                else:
-                    continue
                 images.append(img)
-                messages.append({
-                    "role": item.get("role", "user"),
-                    "content": vision_token,
-                })
-
     return messages, images
 
 def decode_tensor_stream(tensor_stream, tokenizer):
-    """Decode a TensorStream to see its text content."""
     token_view = tensor_stream_token_view(tensor_stream)
     mod = modality_mask(tensor_stream)
-
-    # Get text tokens (excluding vision tokens)
-    text_tokens = token_view[(mod != VisionType.image)]
     decoded = tokenizer.decode(text_tokens[0] if len(text_tokens.shape) > 1 else text_tokens)
     return decoded
 
-def visualize_predictions(generated_text, image, output_path):
-    """Extract bounding boxes from generated text and render them on the input image."""
-    from PIL import ImageDraw, ImageFont
-
-    # Extract bounding boxes from the generated text
     boxes = extract_points(generated_text, expected="box")
-
     if not boxes:
         image.save(output_path)
         return output_path
-
-    # Get image dimensions
     img_width, img_height = image.size
-
-    # Create a copy of the image to draw on
     img_with_boxes = image.copy()
     draw = ImageDraw.Draw(img_with_boxes)
-
-    # Try to use a basic font, fall back to default if not available
     try:
         font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
     except:
         font = ImageFont.load_default()
-
-    # Define colors for different boxes
     colors = ["red", "green", "blue", "yellow", "magenta", "cyan", "orange", "purple"]
-
     for idx, box in enumerate(boxes):
         color = colors[idx % len(colors)]
-
-        # Extract normalized coordinates (0-1000 range)
         norm_x1, norm_y1 = box.top_left.x, box.top_left.y
         norm_x2, norm_y2 = box.bottom_right.x, box.bottom_right.y
-
-        # Scale coordinates from 0-1000 range to actual image dimensions
         x1 = int((norm_x1 / 1000.0) * img_width)
         y1 = int((norm_y1 / 1000.0) * img_height)
         x2 = int((norm_x2 / 1000.0) * img_width)
         y2 = int((norm_y2 / 1000.0) * img_height)
-
-        # Ensure coordinates are within image bounds
         x1 = max(0, min(x1, img_width - 1))
         y1 = max(0, min(y1, img_height - 1))
         x2 = max(0, min(x2, img_width - 1))
         y2 = max(0, min(y2, img_height - 1))
-
-        # Draw the bounding box
         draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
-
-        # Add label if mention exists
         if box.mention:
-            # Calculate text position (above the box if possible)
             text_y = max(y1 - 20, 5)
-
-            # Draw text background for better visibility
             text_bbox = draw.textbbox((x1, text_y), box.mention, font=font)
             draw.rectangle(text_bbox, fill=color)
             draw.text((x1, text_y), box.mention, fill="white", font=font)
-
-    # Save the image with bounding boxes
     img_with_boxes.save(output_path, "JPEG")
     return output_path
 
-# Load model and processor once at startup
-@spaces.GPU(duration=1500)
-def load_model():
-    """Load the Perceptron model with AoT compilation."""
-    hf_path = "PerceptronAI/Isaac-0.1"
-
-    print("Loading processor and config...")
-    config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
-    processor = AutoProcessor.from_pretrained(hf_path, trust_remote_code=True)
-
-    print("Loading model...")
-    model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)
-
-    # Move to appropriate device and dtype
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-    model = model.to(device=device, dtype=dtype)
-    model.eval()
-
-    print(f"Model loaded on {device} with dtype {dtype}")
-    return model, processor, config, device
 
-# Load model during startup
-model, processor, config, device = load_model()
 
-@spaces.GPU(duration=120)
-def generate_response(image_file, text_prompt, max_tokens=256):
-    """Generate response using Perceptron model."""
-    try:
-        # Create document from inputs
-        document = [
-            {
-                "type": "text",
-                "content": "<hint>BOX</hint>",
-                "role": "user",
-            },
-            {
-                "type": "image",
-                "content": image_file,
-                "role": "user",
-            },
-            {
-                "type": "text",
-                "content": text_prompt,
-                "role": "user",
-            },
-        ]
-
-        # Convert document to messages format
-        messages, images = document_to_messages(document, vision_token=config.vision_token)
-
-        # Apply chat template
-        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-        # Process with IsaacProcessor
-        inputs = processor(text=text, images=images, return_tensors="pt")
-        tensor_stream = inputs["tensor_stream"].to(device)
-        input_ids = inputs["input_ids"].to(device)
-
-        # Generate text using the model
-        with torch.no_grad():
-            generated_ids = model.generate(
-                tensor_stream=tensor_stream,
-                max_new_tokens=max_tokens,
-                do_sample=False,
-                pad_token_id=processor.tokenizer.eos_token_id,
-                eos_token_id=processor.tokenizer.eos_token_id,
-            )
-
-        # Decode the generated text
-        generated_text = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=False)
-
-        # Extract new tokens only
-        if generated_ids.shape[1] > input_ids.shape[1]:
-            new_tokens = generated_ids[0, input_ids.shape[1]:]
-            new_text = processor.tokenizer.decode(new_tokens, skip_special_tokens=True)
-        else:
-            new_text = "No new tokens generated"
-
-        # Create visualization
-        if images and len(images) > 0:
-            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
-                viz_path = tmp_file.name
-            viz_path = visualize_predictions(generated_text, images[0], viz_path)
         else:
-            viz_path = None
-
-        return new_text, generated_text, viz_path if viz_path else None
-
-    except Exception as e:
-        return f"Error: {str(e)}", "", None
 
-# Create Gradio interface
-with gr.Blocks(title="HuggingFace Perceptron Demo", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 🚀 HuggingFace Perceptron Multimodal AI Demo
-
-    This demo showcases the PerceptronAI/Isaac-0.1 model for multimodal understanding and generation.
-    Upload an image and provide a text prompt to see the model's response with bounding box visualizations.
-
-    **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**
     """)
-
     with gr.Row():
-        with gr.Column():
             image_input = gr.Image(
-                label="Upload Image",
-                type="filepath",
-                sources=["upload"],
-                height=300
-            )
-            text_input = gr.Textbox(
-                label="Text Prompt",
-                placeholder="Describe what you want to analyze in the image...",
-                lines=3
             )
-            max_tokens_slider = gr.Slider(
-                label="Max Tokens",
-                minimum=50,
-                maximum=512,
-                value=256,
-                step=50
             )
-            generate_btn = gr.Button("Generate Response", variant="primary")
-
-        with gr.Column():
-            new_text_output = gr.Textbox(
-                label="Generated Response",
-                lines=4,
-                interactive=False
-            )
-            full_output = gr.Textbox(
-                label="Full Generated Text",
-                lines=6,
-                interactive=False,
-                visible=False
             )
-            visualization_output = gr.Image(
-                label="Visualization with Bounding Boxes",
-                height=300,
-                interactive=False
             )
-
-    with gr.Accordion("Advanced Options", open=False):
-        gr.Markdown("""
-        - The model processes both text and images using TensorStream technology
-        - Bounding boxes are automatically extracted from the generated text
-        - Supports complex multimodal reasoning tasks
-        """)
-        show_full_checkbox = gr.Checkbox(label="Show Full Generated Text", value=False)
-
-    # Event handlers
-    show_full_checkbox.change(
-        lambda x: gr.Textbox(visible=x),
-        inputs=show_full_checkbox,
-        outputs=full_output
-    )
-
-    generate_btn.click(
-        fn=generate_response,
-        inputs=[image_input, text_input, max_tokens_slider],
-        outputs=[new_text_output, full_output, visualization_output]
-    )
-
-    # Examples
     gr.Examples(
-        examples=[
-            [
-                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg",
-                "Identify all vehicles in the image and describe their positions.",
-                200
-            ],
-            [
-                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/street.jpg",
-                "Analyze the street scene and identify any potential safety concerns.",
-                256
-            ]
-        ],
-        inputs=[image_input, text_input, max_tokens_slider],
-        outputs=[new_text_output, full_output, visualization_output],
         fn=generate_response,
-        cache_examples=True
     )
 
 if __name__ == "__main__":
-    demo.launch(share=True)
 
 
 
 
 
 
 import os
+import sys
+import torch
+from PIL import Image as PILImage
+from PIL import ImageDraw, ImageFont
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor
+from loguru import logger
+import gradio as gr
+import spaces
 
+# Note: The perceptron package needs to be installed or included in the Space
+try:
+    from perceptron.tensorstream import VisionType
+    from perceptron.tensorstream.ops import tensor_stream_token_view, modality_mask
+    from perceptron.pointing.parser import extract_points
+except ImportError:
+    logger.error("perceptron package not found. Please ensure it's installed in your Hugging Face Space.")
+    raise
 
+# Load model at startup
+hf_path = "PerceptronAI/Isaac-0.1"
+logger.info(f"Loading processor and config from HF checkpoint: {hf_path}")
+config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(hf_path, trust_remote_code=True, use_fast=False)
+processor = AutoProcessor.from_pretrained(hf_path, trust_remote_code=True)
+processor.tokenizer = tokenizer  # Ensure tokenizer is set
+
+logger.info(f"Loading AutoModelForCausalLM from HF checkpoint: {hf_path}")
+model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+model = model.to(device=device, dtype=dtype)
+model.eval()
+
+logger.info(f"Model loaded on {device} with dtype {dtype}")
 
 def document_to_messages(document, vision_token="<image>"):
     messages = []
     images = []
     for item in document:
         itype = item.get("type")
         if itype == "text":
             content = item.get("content")
             if content:
+                messages.append({"role": item.get("role", "user"), "content": content})
         elif itype == "image":
+            if "content" in item and item["content"] is not None:
+                img = PILImage.open(item["content"]).convert("RGB")
                 images.append(img)
+                messages.append({"role": item.get("role", "user"), "content": vision_token})
     return messages, images
 
 def decode_tensor_stream(tensor_stream, tokenizer):
     token_view = tensor_stream_token_view(tensor_stream)
     mod = modality_mask(tensor_stream)
+    text_tokens = token_view[(mod != VisionType.image.value)]
     decoded = tokenizer.decode(text_tokens[0] if len(text_tokens.shape) > 1 else text_tokens)
     return decoded
 
+def visualize_predictions(generated_text, image, output_path="prediction.jpeg"):
     boxes = extract_points(generated_text, expected="box")
     if not boxes:
+        logger.info("No bounding boxes found in the generated text")
         image.save(output_path)
         return output_path
+
     img_width, img_height = image.size
     img_with_boxes = image.copy()
     draw = ImageDraw.Draw(img_with_boxes)
+
     try:
         font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
     except:
         font = ImageFont.load_default()
+
     colors = ["red", "green", "blue", "yellow", "magenta", "cyan", "orange", "purple"]
+
     for idx, box in enumerate(boxes):
         color = colors[idx % len(colors)]
         norm_x1, norm_y1 = box.top_left.x, box.top_left.y
         norm_x2, norm_y2 = box.bottom_right.x, box.bottom_right.y
         x1 = int((norm_x1 / 1000.0) * img_width)
         y1 = int((norm_y1 / 1000.0) * img_height)
         x2 = int((norm_x2 / 1000.0) * img_width)
         y2 = int((norm_y2 / 1000.0) * img_height)
+
         x1 = max(0, min(x1, img_width - 1))
         y1 = max(0, min(y1, img_height - 1))
         x2 = max(0, min(x2, img_width - 1))
         y2 = max(0, min(y2, img_height - 1))
+
         draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
+
         if box.mention:
             text_y = max(y1 - 20, 5)
             text_bbox = draw.textbbox((x1, text_y), box.mention, font=font)
             draw.rectangle(text_bbox, fill=color)
             draw.text((x1, text_y), box.mention, fill="white", font=font)
+
     img_with_boxes.save(output_path, "JPEG")
     return output_path
 
+@spaces.GPU(duration=120)
+def generate_response(image, prompt):
+    document = [
+        {"type": "text", "content": "<hint>BOX</hint>", "role": "user"},
+        {"type": "image", "content": image, "role": "user"},
+        {"type": "text", "content": prompt, "role": "user"},
+    ]
+
+    messages, images = document_to_messages(document, vision_token=config.vision_token)
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor(text=text, images=images, return_tensors="pt")
+    tensor_stream = inputs["tensor_stream"].to(device)
+    input_ids = inputs["input_ids"].to(device)
 
+    decoded_content = decode_tensor_stream(tensor_stream, processor.tokenizer)
 
+    with torch.no_grad():
+        generated_ids = model.generate(
+            tensor_stream=tensor_stream,
+            max_new_tokens=256,
+            do_sample=False,
+            pad_token_id=processor.tokenizer.eos_token_id,
+            eos_token_id=processor.tokenizer.eos_token_id,
+        )
+
+    generated_text = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=False)
+
+    if images:
+        vis_path = visualize_predictions(generated_text, images[0])
+        return generated_text, vis_path
     else:
+        return generated_text, None
+
+# Example images and prompts
+examples = [
+    ["example.webp", "Determine whether it is safe to cross the street. Look for signage and moving traffic."],
+]
 
+with gr.Blocks(title="Perceptron Isaac Vision Model", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🔍 Perceptron Isaac Vision Model")
+    gr.Markdown("Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)")
     gr.Markdown("""
+    This demo showcases the Perceptron Isaac-0.1 model for multimodal understanding with bounding box visualization.
+    Upload an image and provide a prompt to analyze the image and see detected objects with bounding boxes.
     """)
+
     with gr.Row():
+        with gr.Column(scale=1):
             image_input = gr.Image(
+                type="filepath",
+                label="Upload Image",
+                sources=["upload", "webcam", "clipboard"],
+                height=400
             )
+            prompt_input = gr.Textbox(
+                label="Prompt",
+                value="Determine whether it is safe to cross the street. Look for signage and moving traffic.",
+                lines=3,
+                placeholder="Enter your prompt here..."
             )
+            generate_btn = gr.Button("🚀 Generate Response", variant="primary", size="lg")
+
+        with gr.Column(scale=1):
+            visualized_image = gr.Image(
+                label="Visualized Predictions (with Bounding Boxes)",
+                height=400
             )
+            generated_text = gr.Textbox(
+                label="Generated Text",
+                lines=10,
+                max_lines=20
             )
+
     gr.Examples(
+        examples=examples,
+        inputs=[image_input, prompt_input],
+        outputs=[generated_text, visualized_image],
         fn=generate_response,
+        cache_examples=False
+    )
+
+    generate_btn.click(
+        generate_response,
+        inputs=[image_input, prompt_input],
+        outputs=[generated_text, visualized_image]
     )
 
 if __name__ == "__main__":
+    demo.launch()
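
For reference, a minimal sketch of how the new generate_response signature could be exercised outside the Gradio UI. It assumes the Space's app.py is importable as app and that a local example.webp (the file referenced in the examples list) exists in the working directory; neither assumption is part of this commit.

# Minimal local sanity check (a sketch, not part of the commit).
# Assumptions: app.py is importable as `app`, and example.webp exists locally
# (the same filename the Space's examples list references).
from app import generate_response

text, vis_path = generate_response(
    "example.webp",
    "Determine whether it is safe to cross the street. Look for signage and moving traffic.",
)
print(text)                                 # full decoded output, including any pointing markup
print("Visualization saved to:", vis_path)  # prediction.jpeg with drawn boxes, or None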