akhaliq (HF Staff) committed · verified
Commit 837d8aa · 1 Parent(s): f3d6bb3

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +234 -120
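The commit message above says the file was pushed with huggingface_hub rather than edited in the browser. As a rough, illustrative sketch only (not taken from this repository), such an upload is typically done through HfApi.upload_file; the repo_id below is a placeholder and the token is assumed to come from the environment or a prior login:

from huggingface_hub import HfApi

api = HfApi()  # reads the access token from HF_TOKEN or a cached `huggingface-cli login`
api.upload_file(
    path_or_fileobj="app.py",        # local file to push
    path_in_repo="app.py",           # destination path inside the repo
    repo_id="akhaliq/<space-name>",  # placeholder Space id, not the real one
    repo_type="space",               # Spaces use repo_type="space"
    commit_message="Upload app.py with huggingface_hub",
)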
app.py CHANGED
@@ -1,192 +1,306 @@
- import os
- import sys
- import torch
- from PIL import Image as PILImage
- from PIL import ImageDraw, ImageFont
- from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor
- from loguru import logger
- import gradio as gr
  import spaces

- # Note: The perceptron package needs to be installed or included in the Space
- try:
-     from perceptron.tensorstream import VisionType
-     from perceptron.tensorstream.ops import tensor_stream_token_view, modality_mask
-     from perceptron.pointing.parser import extract_points
- except ImportError:
-     logger.error("perceptron package not found. Please ensure it's installed in your Hugging Face Space.")
-     raise
-
- # Load model at startup
- hf_path = "PerceptronAI/Isaac-0.1"
- logger.info(f"Loading processor and config from HF checkpoint: {hf_path}")
- config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
- tokenizer = AutoTokenizer.from_pretrained(hf_path, trust_remote_code=True, use_fast=False)
- processor = AutoProcessor.from_pretrained(hf_path, trust_remote_code=True)
- processor.tokenizer = tokenizer # Ensure tokenizer is set
-
- logger.info(f"Loading AutoModelForCausalLM from HF checkpoint: {hf_path}")
- model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
- model = model.to(device=device, dtype=dtype)
- model.eval()

- logger.info(f"Model loaded on {device} with dtype {dtype}")

  def document_to_messages(document, vision_token="<image>"):
      messages = []
      images = []
      for item in document:
          itype = item.get("type")
          if itype == "text":
              content = item.get("content")
              if content:
-                 messages.append({"role": item.get("role", "user"), "content": content})
          elif itype == "image":
-             if "content" in item and item["content"] is not None:
-                 img = PILImage.open(item["content"]).convert("RGB")
                  images.append(img)
-                 messages.append({"role": item.get("role", "user"), "content": vision_token})
      return messages, images

  def decode_tensor_stream(tensor_stream, tokenizer):
      token_view = tensor_stream_token_view(tensor_stream)
      mod = modality_mask(tensor_stream)
-     text_tokens = token_view[(mod != VisionType.image.value)]
      decoded = tokenizer.decode(text_tokens[0] if len(text_tokens.shape) > 1 else text_tokens)
      return decoded

- def visualize_predictions(generated_text, image, output_path="prediction.jpeg"):
      boxes = extract_points(generated_text, expected="box")
      if not boxes:
-         logger.info("No bounding boxes found in the generated text")
          image.save(output_path)
          return output_path
-
      img_width, img_height = image.size
      img_with_boxes = image.copy()
      draw = ImageDraw.Draw(img_with_boxes)
-
      try:
          font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
      except:
          font = ImageFont.load_default()
-
      colors = ["red", "green", "blue", "yellow", "magenta", "cyan", "orange", "purple"]
-
      for idx, box in enumerate(boxes):
          color = colors[idx % len(colors)]
          norm_x1, norm_y1 = box.top_left.x, box.top_left.y
          norm_x2, norm_y2 = box.bottom_right.x, box.bottom_right.y
          x1 = int((norm_x1 / 1000.0) * img_width)
          y1 = int((norm_y1 / 1000.0) * img_height)
          x2 = int((norm_x2 / 1000.0) * img_width)
          y2 = int((norm_y2 / 1000.0) * img_height)
-
          x1 = max(0, min(x1, img_width - 1))
          y1 = max(0, min(y1, img_height - 1))
          x2 = max(0, min(x2, img_width - 1))
          y2 = max(0, min(y2, img_height - 1))
-
          draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
-
          if box.mention:
              text_y = max(y1 - 20, 5)
              text_bbox = draw.textbbox((x1, text_y), box.mention, font=font)
              draw.rectangle(text_bbox, fill=color)
              draw.text((x1, text_y), box.mention, fill="white", font=font)
-
      img_with_boxes.save(output_path, "JPEG")
      return output_path

- @spaces.GPU(duration=120)
- def generate_response(image, prompt):
-     document = [
-         {"type": "text", "content": "<hint>BOX</hint>", "role": "user"},
-         {"type": "image", "content": image, "role": "user"},
-         {"type": "text", "content": prompt, "role": "user"},
-     ]
-
-     messages, images = document_to_messages(document, vision_token=config.vision_token)
-     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     inputs = processor(text=text, images=images, return_tensors="pt")
-     tensor_stream = inputs["tensor_stream"].to(device)
-     input_ids = inputs["input_ids"].to(device)
-
-     decoded_content = decode_tensor_stream(tensor_stream, processor.tokenizer)
-
-     with torch.no_grad():
-         generated_ids = model.generate(
-             tensor_stream=tensor_stream,
-             max_new_tokens=256,
-             do_sample=False,
-             pad_token_id=processor.tokenizer.eos_token_id,
-             eos_token_id=processor.tokenizer.eos_token_id,
-         )

-     generated_text = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=False)

-     if images:
-         vis_path = visualize_predictions(generated_text, images[0])
-         return generated_text, vis_path
      else:
-         return generated_text, None
-
- # Example images and prompts
- examples = [
-     ["example.webp", "Determine whether it is safe to cross the street. Look for signage and moving traffic."],
- ]

- with gr.Blocks(title="Perceptron Isaac Vision Model", theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# 🔍 Perceptron Isaac Vision Model")
-     gr.Markdown("Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)")
      gr.Markdown("""
-     This demo showcases the Perceptron Isaac-0.1 model for multimodal understanding with bounding box visualization.
-     Upload an image and provide a prompt to analyze the image and see detected objects with bounding boxes.
      """)
-
      with gr.Row():
-         with gr.Column(scale=1):
              image_input = gr.Image(
-                 type="filepath",
-                 label="Upload Image",
-                 sources=["upload", "webcam", "clipboard"],
-                 height=400
              )
-             prompt_input = gr.Textbox(
-                 label="Prompt",
-                 value="Determine whether it is safe to cross the street. Look for signage and moving traffic.",
-                 lines=3,
-                 placeholder="Enter your prompt here..."
              )
-             generate_btn = gr.Button("🚀 Generate Response", variant="primary", size="lg")
-
-         with gr.Column(scale=1):
-             visualized_image = gr.Image(
-                 label="Visualized Predictions (with Bounding Boxes)",
-                 height=400
              )
-             generated_text = gr.Textbox(
-                 label="Generated Text",
-                 lines=10,
-                 max_lines=20
              )
-
-     gr.Examples(
-         examples=examples,
-         inputs=[image_input, prompt_input],
-         outputs=[generated_text, visualized_image],
-         fn=generate_response,
-         cache_examples=False
      )
-
      generate_btn.click(
-         generate_response,
-         inputs=[image_input, prompt_input],
-         outputs=[generated_text, visualized_image]
      )

  if __name__ == "__main__":
-     demo.launch()

  import spaces
+ import gradio as gr
+ import torch
+ from PIL import Image
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor
+ import os
+ import tempfile

+ # Import required modules from perceptron
+ from perceptron.tensorstream.ops import tensor_stream_token_view, modality_mask
+ from perceptron.pointing.parser import extract_points

+ # Define vision type enum
+ class VisionType:
+     image = 1

  def document_to_messages(document, vision_token="<image>"):
+     """Convert a Document to messages format compatible with chat templates."""
      messages = []
      images = []
+
      for item in document:
          itype = item.get("type")
          if itype == "text":
              content = item.get("content")
              if content:
+                 messages.append({
+                     "role": item.get("role", "user"),
+                     "content": content,
+                 })
          elif itype == "image":
+             content = item.get("content")
+             if content:
+                 if isinstance(content, str) and os.path.exists(content):
+                     img = Image.open(content)
+                 elif hasattr(content, 'read'): # Gradio file object
+                     img = Image.open(content)
+                 else:
+                     continue
                  images.append(img)
+                 messages.append({
+                     "role": item.get("role", "user"),
+                     "content": vision_token,
+                 })
+
      return messages, images

  def decode_tensor_stream(tensor_stream, tokenizer):
+     """Decode a TensorStream to see its text content."""
      token_view = tensor_stream_token_view(tensor_stream)
      mod = modality_mask(tensor_stream)
+
+     # Get text tokens (excluding vision tokens)
+     text_tokens = token_view[(mod != VisionType.image)]
      decoded = tokenizer.decode(text_tokens[0] if len(text_tokens.shape) > 1 else text_tokens)
      return decoded

+ def visualize_predictions(generated_text, image, output_path):
+     """Extract bounding boxes from generated text and render them on the input image."""
+     from PIL import ImageDraw, ImageFont
+
+     # Extract bounding boxes from the generated text
      boxes = extract_points(generated_text, expected="box")
+
      if not boxes:
          image.save(output_path)
          return output_path
+
+     # Get image dimensions
      img_width, img_height = image.size
+
+     # Create a copy of the image to draw on
      img_with_boxes = image.copy()
      draw = ImageDraw.Draw(img_with_boxes)
+
+     # Try to use a basic font, fall back to default if not available
      try:
          font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
      except:
          font = ImageFont.load_default()
+
+     # Define colors for different boxes
      colors = ["red", "green", "blue", "yellow", "magenta", "cyan", "orange", "purple"]
+
      for idx, box in enumerate(boxes):
          color = colors[idx % len(colors)]
+
+         # Extract normalized coordinates (0-1000 range)
          norm_x1, norm_y1 = box.top_left.x, box.top_left.y
          norm_x2, norm_y2 = box.bottom_right.x, box.bottom_right.y
+
+         # Scale coordinates from 0-1000 range to actual image dimensions
          x1 = int((norm_x1 / 1000.0) * img_width)
          y1 = int((norm_y1 / 1000.0) * img_height)
          x2 = int((norm_x2 / 1000.0) * img_width)
          y2 = int((norm_y2 / 1000.0) * img_height)
+
+         # Ensure coordinates are within image bounds
          x1 = max(0, min(x1, img_width - 1))
          y1 = max(0, min(y1, img_height - 1))
          x2 = max(0, min(x2, img_width - 1))
          y2 = max(0, min(y2, img_height - 1))
+
+         # Draw the bounding box
          draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
+
+         # Add label if mention exists
          if box.mention:
+             # Calculate text position (above the box if possible)
              text_y = max(y1 - 20, 5)
+
+             # Draw text background for better visibility
              text_bbox = draw.textbbox((x1, text_y), box.mention, font=font)
              draw.rectangle(text_bbox, fill=color)
              draw.text((x1, text_y), box.mention, fill="white", font=font)
+
+     # Save the image with bounding boxes
      img_with_boxes.save(output_path, "JPEG")
      return output_path

+ # Load model and processor once at startup
+ @spaces.GPU(duration=1500)
+ def load_model():
+     """Load the Perceptron model with AoT compilation."""
+     hf_path = "PerceptronAI/Isaac-0.1"
+
+     print("Loading processor and config...")
+     config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
+     processor = AutoProcessor.from_pretrained(hf_path, trust_remote_code=True)
+
+     print("Loading model...")
+     model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)
+
+     # Move to appropriate device and dtype
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+     model = model.to(device=device, dtype=dtype)
+     model.eval()
+
+     print(f"Model loaded on {device} with dtype {dtype}")
+     return model, processor, config, device

+ # Load model during startup
+ model, processor, config, device = load_model()

+ @spaces.GPU(duration=120)
+ def generate_response(image_file, text_prompt, max_tokens=256):
+     """Generate response using Perceptron model."""
+     try:
+         # Create document from inputs
+         document = [
+             {
+                 "type": "text",
+                 "content": "<hint>BOX</hint>",
+                 "role": "user",
+             },
+             {
+                 "type": "image",
+                 "content": image_file,
+                 "role": "user",
+             },
+             {
+                 "type": "text",
+                 "content": text_prompt,
+                 "role": "user",
+             },
+         ]
+
+         # Convert document to messages format
+         messages, images = document_to_messages(document, vision_token=config.vision_token)
+
+         # Apply chat template
+         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+         # Process with IsaacProcessor
+         inputs = processor(text=text, images=images, return_tensors="pt")
+         tensor_stream = inputs["tensor_stream"].to(device)
+         input_ids = inputs["input_ids"].to(device)
+
+         # Generate text using the model
+         with torch.no_grad():
+             generated_ids = model.generate(
+                 tensor_stream=tensor_stream,
+                 max_new_tokens=max_tokens,
+                 do_sample=False,
+                 pad_token_id=processor.tokenizer.eos_token_id,
+                 eos_token_id=processor.tokenizer.eos_token_id,
+             )
+
+         # Decode the generated text
+         generated_text = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=False)
+
+         # Extract new tokens only
+         if generated_ids.shape[1] > input_ids.shape[1]:
+             new_tokens = generated_ids[0, input_ids.shape[1]:]
+             new_text = processor.tokenizer.decode(new_tokens, skip_special_tokens=True)
+         else:
+             new_text = "No new tokens generated"
+
+         # Create visualization
+         if images and len(images) > 0:
+             with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
+                 viz_path = tmp_file.name
+             viz_path = visualize_predictions(generated_text, images[0], viz_path)
          else:
+             viz_path = None
+
+         return new_text, generated_text, viz_path if viz_path else None
+
+     except Exception as e:
+         return f"Error: {str(e)}", "", None

+ # Create Gradio interface
+ with gr.Blocks(title="HuggingFace Perceptron Demo", theme=gr.themes.Soft()) as demo:
      gr.Markdown("""
+     # 🚀 HuggingFace Perceptron Multimodal AI Demo
+
+     This demo showcases the PerceptronAI/Isaac-0.1 model for multimodal understanding and generation.
+     Upload an image and provide a text prompt to see the model's response with bounding box visualizations.
+
+     **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**
      """)
+
      with gr.Row():
+         with gr.Column():
              image_input = gr.Image(
+                 label="Upload Image",
+                 type="filepath",
+                 sources=["upload"],
+                 height=300
              )
+             text_input = gr.Textbox(
+                 label="Text Prompt",
+                 placeholder="Describe what you want to analyze in the image...",
+                 lines=3
              )
+             max_tokens_slider = gr.Slider(
+                 label="Max Tokens",
+                 minimum=50,
+                 maximum=512,
+                 value=256,
+                 step=50
              )
+             generate_btn = gr.Button("Generate Response", variant="primary")
+
+         with gr.Column():
+             new_text_output = gr.Textbox(
+                 label="Generated Response",
+                 lines=4,
+                 interactive=False
              )
+             full_output = gr.Textbox(
+                 label="Full Generated Text",
+                 lines=6,
+                 interactive=False,
+                 visible=False
+             )
+             visualization_output = gr.Image(
+                 label="Visualization with Bounding Boxes",
+                 height=300,
+                 interactive=False
+             )
+
+     with gr.Accordion("Advanced Options", open=False):
+         gr.Markdown("""
+         - The model processes both text and images using TensorStream technology
+         - Bounding boxes are automatically extracted from the generated text
+         - Supports complex multimodal reasoning tasks
+         """)
+         show_full_checkbox = gr.Checkbox(label="Show Full Generated Text", value=False)
+
+     # Event handlers
+     show_full_checkbox.change(
+         lambda x: gr.Textbox(visible=x),
+         inputs=show_full_checkbox,
+         outputs=full_output
      )
+
      generate_btn.click(
+         fn=generate_response,
+         inputs=[image_input, text_input, max_tokens_slider],
+         outputs=[new_text_output, full_output, visualization_output]
+     )
+
+     # Examples
+     gr.Examples(
+         examples=[
+             [
+                 "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg",
+                 "Identify all vehicles in the image and describe their positions.",
+                 200
+             ],
+             [
+                 "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/street.jpg",
+                 "Analyze the street scene and identify any potential safety concerns.",
+                 256
+             ]
+         ],
+         inputs=[image_input, text_input, max_tokens_slider],
+         outputs=[new_text_output, full_output, visualization_output],
+         fn=generate_response,
+         cache_examples=True
      )

  if __name__ == "__main__":
+     demo.launch(share=True)
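
Both the old and new visualize_predictions scale box coordinates from a 0-1000 normalized grid to pixel coordinates (see the "Scale coordinates from 0-1000 range" comment above). A minimal sketch of that arithmetic, using made-up numbers purely for illustration:

# Illustrative values only: a 1280x720 image and one box as the model might emit it.
img_width, img_height = 1280, 720
norm_x1, norm_y1 = 125, 250   # top-left corner, normalized to 0-1000
norm_x2, norm_y2 = 500, 750   # bottom-right corner, normalized to 0-1000

# Same conversion used in app.py: map the 0-1000 range onto pixel coordinates.
x1 = int((norm_x1 / 1000.0) * img_width)    # 160
y1 = int((norm_y1 / 1000.0) * img_height)   # 180
x2 = int((norm_x2 / 1000.0) * img_width)    # 640
y2 = int((norm_y2 / 1000.0) * img_height)   # 540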