akhaliq (HF Staff) committed · verified
Commit 837d8aa · 1 Parent(s): f3d6bb3

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +234 -120
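The commit message above says the file was pushed with huggingface_hub rather than edited in the browser. As a rough, illustrative sketch only (not taken from this repository), such an upload is typically done through HfApi.upload_file; the repo_id below is a placeholder and the token is assumed to come from the environment or a prior login:

from huggingface_hub import HfApi

api = HfApi()  # reads the access token from HF_TOKEN or a cached `huggingface-cli login`
api.upload_file(
    path_or_fileobj="app.py",        # local file to push
    path_in_repo="app.py",           # destination path inside the repo
    repo_id="akhaliq/<space-name>",  # placeholder Space id, not the real one
    repo_type="space",               # Spaces use repo_type="space"
    commit_message="Upload app.py with huggingface_hub",
)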
app.py CHANGED
@@ -1,192 +1,306 @@
- import os
- import sys
- import torch
- from PIL import Image as PILImage
- from PIL import ImageDraw, ImageFont
- from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor
- from loguru import logger
- import gradio as gr
  import spaces

- # Note: The perceptron package needs to be installed or included in the Space
- try:
-     from perceptron.tensorstream import VisionType
-     from perceptron.tensorstream.ops import tensor_stream_token_view, modality_mask
-     from perceptron.pointing.parser import extract_points
- except ImportError:
-     logger.error("perceptron package not found. Please ensure it's installed in your Hugging Face Space.")
-     raise
-
- # Load model at startup
- hf_path = "PerceptronAI/Isaac-0.1"
- logger.info(f"Loading processor and config from HF checkpoint: {hf_path}")
- config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
- tokenizer = AutoTokenizer.from_pretrained(hf_path, trust_remote_code=True, use_fast=False)
- processor = AutoProcessor.from_pretrained(hf_path, trust_remote_code=True)
- processor.tokenizer = tokenizer # Ensure tokenizer is set
-
- logger.info(f"Loading AutoModelForCausalLM from HF checkpoint: {hf_path}")
- model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
- model = model.to(device=device, dtype=dtype)
- model.eval()

- logger.info(f"Model loaded on {device} with dtype {dtype}")

  def document_to_messages(document, vision_token="<image>"):
      messages = []
      images = []
      for item in document:
          itype = item.get("type")
          if itype == "text":
              content = item.get("content")
              if content:
-                 messages.append({"role": item.get("role", "user"), "content": content})
          elif itype == "image":
-             if "content" in item and item["content"] is not None:
-                 img = PILImage.open(item["content"]).convert("RGB")
                  images.append(img)
-                 messages.append({"role": item.get("role", "user"), "content": vision_token})
      return messages, images

  def decode_tensor_stream(tensor_stream, tokenizer):
      token_view = tensor_stream_token_view(tensor_stream)
      mod = modality_mask(tensor_stream)
-     text_tokens = token_view[(mod != VisionType.image.value)]
      decoded = tokenizer.decode(text_tokens[0] if len(text_tokens.shape) > 1 else text_tokens)
      return decoded

- def visualize_predictions(generated_text, image, output_path="prediction.jpeg"):
      boxes = extract_points(generated_text, expected="box")
      if not boxes:
-         logger.info("No bounding boxes found in the generated text")
          image.save(output_path)
          return output_path
-
      img_width, img_height = image.size
      img_with_boxes = image.copy()
      draw = ImageDraw.Draw(img_with_boxes)
-
      try:
          font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
      except:
          font = ImageFont.load_default()
-
      colors = ["red", "green", "blue", "yellow", "magenta", "cyan", "orange", "purple"]
-
      for idx, box in enumerate(boxes):
          color = colors[idx % len(colors)]
          norm_x1, norm_y1 = box.top_left.x, box.top_left.y
          norm_x2, norm_y2 = box.bottom_right.x, box.bottom_right.y
          x1 = int((norm_x1 / 1000.0) * img_width)
          y1 = int((norm_y1 / 1000.0) * img_height)
          x2 = int((norm_x2 / 1000.0) * img_width)
          y2 = int((norm_y2 / 1000.0) * img_height)
-
          x1 = max(0, min(x1, img_width - 1))
          y1 = max(0, min(y1, img_height - 1))
          x2 = max(0, min(x2, img_width - 1))
          y2 = max(0, min(y2, img_height - 1))
-
          draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
-
          if box.mention:
              text_y = max(y1 - 20, 5)
              text_bbox = draw.textbbox((x1, text_y), box.mention, font=font)
              draw.rectangle(text_bbox, fill=color)
              draw.text((x1, text_y), box.mention, fill="white", font=font)
-
      img_with_boxes.save(output_path, "JPEG")
      return output_path

- @spaces.GPU(duration=120)
- def generate_response(image, prompt):
-     document = [
-         {"type": "text", "content": "<hint>BOX</hint>", "role": "user"},
-         {"type": "image", "content": image, "role": "user"},
-         {"type": "text", "content": prompt, "role": "user"},
-     ]
-
-     messages, images = document_to_messages(document, vision_token=config.vision_token)
-     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     inputs = processor(text=text, images=images, return_tensors="pt")
-     tensor_stream = inputs["tensor_stream"].to(device)
-     input_ids = inputs["input_ids"].to(device)
-
-     decoded_content = decode_tensor_stream(tensor_stream, processor.tokenizer)
-
-     with torch.no_grad():
-         generated_ids = model.generate(
-             tensor_stream=tensor_stream,
-             max_new_tokens=256,
-             do_sample=False,
-             pad_token_id=processor.tokenizer.eos_token_id,
-             eos_token_id=processor.tokenizer.eos_token_id,
-         )

-     generated_text = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=False)

-     if images:
-         vis_path = visualize_predictions(generated_text, images[0])
-         return generated_text, vis_path
      else:
-         return generated_text, None
-
- # Example images and prompts
- examples = [
-     ["example.webp", "Determine whether it is safe to cross the street. Look for signage and moving traffic."],
- ]

- with gr.Blocks(title="Perceptron Isaac Vision Model", theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# 🔍 Perceptron Isaac Vision Model")
-     gr.Markdown("Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)")
      gr.Markdown("""
-     This demo showcases the Perceptron Isaac-0.1 model for multimodal understanding with bounding box visualization.
-     Upload an image and provide a prompt to analyze the image and see detected objects with bounding boxes.
      """)
-
      with gr.Row():
-         with gr.Column(scale=1):
              image_input = gr.Image(
-                 type="filepath",
-                 label="Upload Image",
-                 sources=["upload", "webcam", "clipboard"],
-                 height=400
              )
-             prompt_input = gr.Textbox(
-                 label="Prompt",
-                 value="Determine whether it is safe to cross the street. Look for signage and moving traffic.",
-                 lines=3,
-                 placeholder="Enter your prompt here..."
              )
-             generate_btn = gr.Button("🚀 Generate Response", variant="primary", size="lg")
-
-         with gr.Column(scale=1):
-             visualized_image = gr.Image(
-                 label="Visualized Predictions (with Bounding Boxes)",
-                 height=400
              )
-             generated_text = gr.Textbox(
-                 label="Generated Text",
-                 lines=10,
-                 max_lines=20
              )
-
-     gr.Examples(
-         examples=examples,
-         inputs=[image_input, prompt_input],
-         outputs=[generated_text, visualized_image],
-         fn=generate_response,
-         cache_examples=False
      )
-
      generate_btn.click(
-         generate_response,
-         inputs=[image_input, prompt_input],
-         outputs=[generated_text, visualized_image]
      )

  if __name__ == "__main__":
-     demo.launch()

  import spaces
+ import gradio as gr
+ import torch
+ from PIL import Image
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor
+ import os
+ import tempfile

+ # Import required modules from perceptron
+ from perceptron.tensorstream.ops import tensor_stream_token_view, modality_mask
+ from perceptron.pointing.parser import extract_points

+ # Define vision type enum
+ class VisionType:
+     image = 1

  def document_to_messages(document, vision_token="<image>"):
+     """Convert a Document to messages format compatible with chat templates."""
      messages = []
      images = []
+
      for item in document:
          itype = item.get("type")
          if itype == "text":
              content = item.get("content")
              if content:
+                 messages.append({
+                     "role": item.get("role", "user"),
+                     "content": content,
+                 })
          elif itype == "image":
+             content = item.get("content")
+             if content:
+                 if isinstance(content, str) and os.path.exists(content):
+                     img = Image.open(content)
+                 elif hasattr(content, 'read'): # Gradio file object
+                     img = Image.open(content)
+                 else:
+                     continue
                  images.append(img)
+                 messages.append({
+                     "role": item.get("role", "user"),
+                     "content": vision_token,
+                 })
+
      return messages, images

  def decode_tensor_stream(tensor_stream, tokenizer):
+     """Decode a TensorStream to see its text content."""
      token_view = tensor_stream_token_view(tensor_stream)
      mod = modality_mask(tensor_stream)
+
+     # Get text tokens (excluding vision tokens)
+     text_tokens = token_view[(mod != VisionType.image)]
      decoded = tokenizer.decode(text_tokens[0] if len(text_tokens.shape) > 1 else text_tokens)
      return decoded

+ def visualize_predictions(generated_text, image, output_path):
+     """Extract bounding boxes from generated text and render them on the input image."""
+     from PIL import ImageDraw, ImageFont
+
+     # Extract bounding boxes from the generated text
      boxes = extract_points(generated_text, expected="box")
+
      if not boxes:
          image.save(output_path)
          return output_path
+
+     # Get image dimensions
      img_width, img_height = image.size
+
+     # Create a copy of the image to draw on
      img_with_boxes = image.copy()
      draw = ImageDraw.Draw(img_with_boxes)
+
+     # Try to use a basic font, fall back to default if not available
      try:
          font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
      except:
          font = ImageFont.load_default()
+
+     # Define colors for different boxes
      colors = ["red", "green", "blue", "yellow", "magenta", "cyan", "orange", "purple"]
+
      for idx, box in enumerate(boxes):
          color = colors[idx % len(colors)]
+
+         # Extract normalized coordinates (0-1000 range)
          norm_x1, norm_y1 = box.top_left.x, box.top_left.y
          norm_x2, norm_y2 = box.bottom_right.x, box.bottom_right.y
+
+         # Scale coordinates from 0-1000 range to actual image dimensions
          x1 = int((norm_x1 / 1000.0) * img_width)
          y1 = int((norm_y1 / 1000.0) * img_height)
          x2 = int((norm_x2 / 1000.0) * img_width)
          y2 = int((norm_y2 / 1000.0) * img_height)
+
+         # Ensure coordinates are within image bounds
          x1 = max(0, min(x1, img_width - 1))
          y1 = max(0, min(y1, img_height - 1))
          x2 = max(0, min(x2, img_width - 1))
          y2 = max(0, min(y2, img_height - 1))
+
+         # Draw the bounding box
          draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
+
+         # Add label if mention exists
          if box.mention:
+             # Calculate text position (above the box if possible)
              text_y = max(y1 - 20, 5)
+
+             # Draw text background for better visibility
              text_bbox = draw.textbbox((x1, text_y), box.mention, font=font)
              draw.rectangle(text_bbox, fill=color)
              draw.text((x1, text_y), box.mention, fill="white", font=font)
+
+     # Save the image with bounding boxes
      img_with_boxes.save(output_path, "JPEG")
      return output_path

+ # Load model and processor once at startup
+ @spaces.GPU(duration=1500)
+ def load_model():
+     """Load the Perceptron model with AoT compilation."""
+     hf_path = "PerceptronAI/Isaac-0.1"
+
+     print("Loading processor and config...")
+     config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
+     processor = AutoProcessor.from_pretrained(hf_path, trust_remote_code=True)
+
+     print("Loading model...")
+     model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)
+
+     # Move to appropriate device and dtype
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+     model = model.to(device=device, dtype=dtype)
+     model.eval()
+
+     print(f"Model loaded on {device} with dtype {dtype}")
+     return model, processor, config, device

+ # Load model during startup
+ model, processor, config, device = load_model()

+ @spaces.GPU(duration=120)
+ def generate_response(image_file, text_prompt, max_tokens=256):
+     """Generate response using Perceptron model."""
+     try:
+         # Create document from inputs
+         document = [
+             {
+                 "type": "text",
+                 "content": "<hint>BOX</hint>",
+                 "role": "user",
+             },
+             {
+                 "type": "image",
+                 "content": image_file,
+                 "role": "user",
+             },
+             {
+                 "type": "text",
+                 "content": text_prompt,
+                 "role": "user",
+             },
+         ]
+
+         # Convert document to messages format
+         messages, images = document_to_messages(document, vision_token=config.vision_token)
+
+         # Apply chat template
+         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+         # Process with IsaacProcessor
+         inputs = processor(text=text, images=images, return_tensors="pt")
+         tensor_stream = inputs["tensor_stream"].to(device)
+         input_ids = inputs["input_ids"].to(device)
+
+         # Generate text using the model
+         with torch.no_grad():
+             generated_ids = model.generate(
+                 tensor_stream=tensor_stream,
+                 max_new_tokens=max_tokens,
+                 do_sample=False,
+                 pad_token_id=processor.tokenizer.eos_token_id,
+                 eos_token_id=processor.tokenizer.eos_token_id,
+             )
+
+         # Decode the generated text
+         generated_text = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=False)
+
+         # Extract new tokens only
+         if generated_ids.shape[1] > input_ids.shape[1]:
+             new_tokens = generated_ids[0, input_ids.shape[1]:]
+             new_text = processor.tokenizer.decode(new_tokens, skip_special_tokens=True)
+         else:
+             new_text = "No new tokens generated"
+
+         # Create visualization
+         if images and len(images) > 0:
+             with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
+                 viz_path = tmp_file.name
+             viz_path = visualize_predictions(generated_text, images[0], viz_path)
          else:
+             viz_path = None
+
+         return new_text, generated_text, viz_path if viz_path else None
+
+     except Exception as e:
+         return f"Error: {str(e)}", "", None

+ # Create Gradio interface
+ with gr.Blocks(title="HuggingFace Perceptron Demo", theme=gr.themes.Soft()) as demo:
      gr.Markdown("""
+     # 🚀 HuggingFace Perceptron Multimodal AI Demo
+
+     This demo showcases the PerceptronAI/Isaac-0.1 model for multimodal understanding and generation.
+     Upload an image and provide a text prompt to see the model's response with bounding box visualizations.
+
+     **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**
      """)
+
      with gr.Row():
+         with gr.Column():
              image_input = gr.Image(
+                 label="Upload Image",
+                 type="filepath",
+                 sources=["upload"],
+                 height=300
              )
+             text_input = gr.Textbox(
+                 label="Text Prompt",
+                 placeholder="Describe what you want to analyze in the image...",
+                 lines=3
              )
+             max_tokens_slider = gr.Slider(
+                 label="Max Tokens",
+                 minimum=50,
+                 maximum=512,
+                 value=256,
+                 step=50
              )
+             generate_btn = gr.Button("Generate Response", variant="primary")
+
+         with gr.Column():
+             new_text_output = gr.Textbox(
+                 label="Generated Response",
+                 lines=4,
+                 interactive=False
              )
+             full_output = gr.Textbox(
+                 label="Full Generated Text",
+                 lines=6,
+                 interactive=False,
+                 visible=False
+             )
+             visualization_output = gr.Image(
+                 label="Visualization with Bounding Boxes",
+                 height=300,
+                 interactive=False
+             )
+
+     with gr.Accordion("Advanced Options", open=False):
+         gr.Markdown("""
+         - The model processes both text and images using TensorStream technology
+         - Bounding boxes are automatically extracted from the generated text
+         - Supports complex multimodal reasoning tasks
+         """)
+         show_full_checkbox = gr.Checkbox(label="Show Full Generated Text", value=False)
+
+     # Event handlers
+     show_full_checkbox.change(
+         lambda x: gr.Textbox(visible=x),
+         inputs=show_full_checkbox,
+         outputs=full_output
      )
+
      generate_btn.click(
+         fn=generate_response,
+         inputs=[image_input, text_input, max_tokens_slider],
+         outputs=[new_text_output, full_output, visualization_output]
+     )
+
+     # Examples
+     gr.Examples(
+         examples=[
+             [
+                 "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg",
+                 "Identify all vehicles in the image and describe their positions.",
+                 200
+             ],
+             [
+                 "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/street.jpg",
+                 "Analyze the street scene and identify any potential safety concerns.",
+                 256
+             ]
+         ],
+         inputs=[image_input, text_input, max_tokens_slider],
+         outputs=[new_text_output, full_output, visualization_output],
+         fn=generate_response,
+         cache_examples=True
      )

  if __name__ == "__main__":
+     demo.launch(share=True)
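
Both the old and new visualize_predictions scale box coordinates from a 0-1000 normalized grid to pixel coordinates (see the "Scale coordinates from 0-1000 range" comment above). A minimal sketch of that arithmetic, using made-up numbers purely for illustration:

# Illustrative values only: a 1280x720 image and one box as the model might emit it.
img_width, img_height = 1280, 720
norm_x1, norm_y1 = 125, 250   # top-left corner, normalized to 0-1000
norm_x2, norm_y2 = 500, 750   # bottom-right corner, normalized to 0-1000

# Same conversion used in app.py: map the 0-1000 range onto pixel coordinates.
x1 = int((norm_x1 / 1000.0) * img_width)    # 160
y1 = int((norm_y1 / 1000.0) * img_height)   # 180
x2 = int((norm_x2 / 1000.0) * img_width)    # 640
y2 = int((norm_y2 / 1000.0) * img_height)   # 540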