Spaces:

akhaliq
/

Isaac-0.1

Running on Zero

App Files Files Community

akhaliq HF Staff committed on Sep 22

Commit

8e814c4

verified ·

1 Parent(s): 837d8aa

Update app.py

Browse files

Files changed (1) hide show

app.py +120 -234

app.py CHANGED Viewed

@@ -1,306 +1,192 @@
-import spaces
-import gradio as gr
-import torch
-from PIL import Image
-from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor
 import os
-import tempfile
-# Import required modules from perceptron
-from perceptron.tensorstream.ops import tensor_stream_token_view, modality_mask
-from perceptron.pointing.parser import extract_points
-# Define vision type enum
-class VisionType:
-    image = 1
 def document_to_messages(document, vision_token="<image>"):
-    """Convert a Document to messages format compatible with chat templates."""
     messages = []
     images = []
     for item in document:
         itype = item.get("type")
         if itype == "text":
             content = item.get("content")
             if content:
-                messages.append({
-                    "role": item.get("role", "user"),
-                    "content": content,
-                })
         elif itype == "image":
-            content = item.get("content")
-            if content:
-                if isinstance(content, str) and os.path.exists(content):
-                    img = Image.open(content)
-                elif hasattr(content, 'read'):  # Gradio file object
-                    img = Image.open(content)
-                else:
-                    continue
                 images.append(img)
-                messages.append({
-                    "role": item.get("role", "user"),
-                    "content": vision_token,
-                })
     return messages, images
 def decode_tensor_stream(tensor_stream, tokenizer):
-    """Decode a TensorStream to see its text content."""
     token_view = tensor_stream_token_view(tensor_stream)
     mod = modality_mask(tensor_stream)
-    # Get text tokens (excluding vision tokens)
-    text_tokens = token_view[(mod != VisionType.image)]
     decoded = tokenizer.decode(text_tokens[0] if len(text_tokens.shape) > 1 else text_tokens)
     return decoded
-def visualize_predictions(generated_text, image, output_path):
-    """Extract bounding boxes from generated text and render them on the input image."""
-    from PIL import ImageDraw, ImageFont
-    # Extract bounding boxes from the generated text
     boxes = extract_points(generated_text, expected="box")
     if not boxes:
         image.save(output_path)
         return output_path
-    # Get image dimensions
     img_width, img_height = image.size
-    # Create a copy of the image to draw on
     img_with_boxes = image.copy()
     draw = ImageDraw.Draw(img_with_boxes)
-    # Try to use a basic font, fall back to default if not available
     try:
         font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
     except:
         font = ImageFont.load_default()
-    # Define colors for different boxes
     colors = ["red", "green", "blue", "yellow", "magenta", "cyan", "orange", "purple"]
     for idx, box in enumerate(boxes):
         color = colors[idx % len(colors)]
-        # Extract normalized coordinates (0-1000 range)
         norm_x1, norm_y1 = box.top_left.x, box.top_left.y
         norm_x2, norm_y2 = box.bottom_right.x, box.bottom_right.y
-        # Scale coordinates from 0-1000 range to actual image dimensions
         x1 = int((norm_x1 / 1000.0) * img_width)
         y1 = int((norm_y1 / 1000.0) * img_height)
         x2 = int((norm_x2 / 1000.0) * img_width)
         y2 = int((norm_y2 / 1000.0) * img_height)
-        # Ensure coordinates are within image bounds
         x1 = max(0, min(x1, img_width - 1))
         y1 = max(0, min(y1, img_height - 1))
         x2 = max(0, min(x2, img_width - 1))
         y2 = max(0, min(y2, img_height - 1))
-        # Draw the bounding box
         draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
-        # Add label if mention exists
         if box.mention:
-            # Calculate text position (above the box if possible)
             text_y = max(y1 - 20, 5)
-            # Draw text background for better visibility
             text_bbox = draw.textbbox((x1, text_y), box.mention, font=font)
             draw.rectangle(text_bbox, fill=color)
             draw.text((x1, text_y), box.mention, fill="white", font=font)
-    # Save the image with bounding boxes
     img_with_boxes.save(output_path, "JPEG")
     return output_path
-# Load model and processor once at startup
-@spaces.GPU(duration=1500)
-def load_model():
-    """Load the Perceptron model with AoT compilation."""
-    hf_path = "PerceptronAI/Isaac-0.1"
-    print("Loading processor and config...")
-    config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
-    processor = AutoProcessor.from_pretrained(hf_path, trust_remote_code=True)
-    print("Loading model...")
-    model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)
-    # Move to appropriate device and dtype
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-    model = model.to(device=device, dtype=dtype)
-    model.eval()
-    print(f"Model loaded on {device} with dtype {dtype}")
-    return model, processor, config, device
-# Load model during startup
-model, processor, config, device = load_model()
-@spaces.GPU(duration=120)
-def generate_response(image_file, text_prompt, max_tokens=256):
-    """Generate response using Perceptron model."""
-    try:
-        # Create document from inputs
-        document = [
-            {
-                "type": "text",
-                "content": "<hint>BOX</hint>",
-                "role": "user",
-            },
-            {
-                "type": "image",
-                "content": image_file,
-                "role": "user",
-            },
-            {
-                "type": "text",
-                "content": text_prompt,
-                "role": "user",
-            },
-        ]
-        # Convert document to messages format
-        messages, images = document_to_messages(document, vision_token=config.vision_token)
-        # Apply chat template
-        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        # Process with IsaacProcessor
-        inputs = processor(text=text, images=images, return_tensors="pt")
-        tensor_stream = inputs["tensor_stream"].to(device)
-        input_ids = inputs["input_ids"].to(device)
-        # Generate text using the model
-        with torch.no_grad():
-            generated_ids = model.generate(
-                tensor_stream=tensor_stream,
-                max_new_tokens=max_tokens,
-                do_sample=False,
-                pad_token_id=processor.tokenizer.eos_token_id,
-                eos_token_id=processor.tokenizer.eos_token_id,
-            )
-            # Decode the generated text
-            generated_text = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=False)
-            # Extract new tokens only
-            if generated_ids.shape[1] > input_ids.shape[1]:
-                new_tokens = generated_ids[0, input_ids.shape[1]:]
-                new_text = processor.tokenizer.decode(new_tokens, skip_special_tokens=True)
-            else:
-                new_text = "No new tokens generated"
-        # Create visualization
-        if images and len(images) > 0:
-            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
-                viz_path = tmp_file.name
-            viz_path = visualize_predictions(generated_text, images[0], viz_path)
         else:
-            viz_path = None
-        return new_text, generated_text, viz_path if viz_path else None
-    except Exception as e:
-        return f"Error: {str(e)}", "", None
-# Create Gradio interface
-with gr.Blocks(title="HuggingFace Perceptron Demo", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 🚀 HuggingFace Perceptron Multimodal AI Demo
-    This demo showcases the PerceptronAI/Isaac-0.1 model for multimodal understanding and generation.
-    Upload an image and provide a text prompt to see the model's response with bounding box visualizations.
-    **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**
     """)
     with gr.Row():
-        with gr.Column():
             image_input = gr.Image(
-                label="Upload Image",
-                type="filepath",
-                sources=["upload"],
-                height=300
-            )
-            text_input = gr.Textbox(
-                label="Text Prompt",
-                placeholder="Describe what you want to analyze in the image...",
-                lines=3
             )
-            max_tokens_slider = gr.Slider(
-                label="Max Tokens",
-                minimum=50,
-                maximum=512,
-                value=256,
-                step=50
             )
-            generate_btn = gr.Button("Generate Response", variant="primary")
-        with gr.Column():
-            new_text_output = gr.Textbox(
-                label="Generated Response",
-                lines=4,
-                interactive=False
-            )
-            full_output = gr.Textbox(
-                label="Full Generated Text",
-                lines=6,
-                interactive=False,
-                visible=False
             )
-            visualization_output = gr.Image(
-                label="Visualization with Bounding Boxes",
-                height=300,
-                interactive=False
             )
-    with gr.Accordion("Advanced Options", open=False):
-        gr.Markdown("""
-        - The model processes both text and images using TensorStream technology
-        - Bounding boxes are automatically extracted from the generated text
-        - Supports complex multimodal reasoning tasks
-        """)
-        show_full_checkbox = gr.Checkbox(label="Show Full Generated Text", value=False)
-    # Event handlers
-    show_full_checkbox.change(
-        lambda x: gr.Textbox(visible=x),
-        inputs=show_full_checkbox,
-        outputs=full_output
-    )
-    generate_btn.click(
-        fn=generate_response,
-        inputs=[image_input, text_input, max_tokens_slider],
-        outputs=[new_text_output, full_output, visualization_output]
-    )
-    # Examples
     gr.Examples(
-        examples=[
-            [
-                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg",
-                "Identify all vehicles in the image and describe their positions.",
-                200
-            ],
-            [
-                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/street.jpg",
-                "Analyze the street scene and identify any potential safety concerns.",
-                256
-            ]
-        ],
-        inputs=[image_input, text_input, max_tokens_slider],
-        outputs=[new_text_output, full_output, visualization_output],
         fn=generate_response,
-        cache_examples=True
     )
 if __name__ == "__main__":
-    demo.launch(share=True)

 import os
+import sys
+import torch
+from PIL import Image as PILImage
+from PIL import ImageDraw, ImageFont
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor
+from loguru import logger
+import gradio as gr
+import spaces
+# Note: The perceptron package needs to be installed or included in the Space
+try:
+    from perceptron.tensorstream import VisionType
+    from perceptron.tensorstream.ops import tensor_stream_token_view, modality_mask
+    from perceptron.pointing.parser import extract_points
+except ImportError:
+    logger.error("perceptron package not found. Please ensure it's installed in your Hugging Face Space.")
+    raise
+# Load model at startup
+# These statements run at import time, so the checkpoint is loaded once per
+# process and the resulting module-level objects (config, tokenizer, processor,
+# model, device, dtype) are shared by the functions defined further down.
+hf_path = "PerceptronAI/Isaac-0.1"
+logger.info(f"Loading processor and config from HF checkpoint: {hf_path}")
+# trust_remote_code=True is passed to every from_pretrained call below.
+# NOTE(review): presumably required because the checkpoint ships custom classes - confirm.
+config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
+# use_fast=False explicitly requests the slow (Python) tokenizer implementation.
+tokenizer = AutoTokenizer.from_pretrained(hf_path, trust_remote_code=True, use_fast=False)
+processor = AutoProcessor.from_pretrained(hf_path, trust_remote_code=True)
+processor.tokenizer = tokenizer  # Ensure tokenizer is set
+logger.info(f"Loading AutoModelForCausalLM from HF checkpoint: {hf_path}")
+model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)
+# Use the GPU with bfloat16 when CUDA is visible, otherwise CPU with float32.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+model = model.to(device=device, dtype=dtype)
+# Switch to evaluation mode; the model is only used for generation here.
+model.eval()
+logger.info(f"Model loaded on {device} with dtype {dtype}")
 def document_to_messages(document, vision_token="<image>"):
+    """Split a document into chat messages and the images they refer to.
+
+    Args:
+        document: Iterable of dicts. Each dict is read through its "type",
+            "content" and optional "role" keys; items whose type is neither
+            "text" nor "image" are ignored.
+        vision_token: Placeholder text stored as the message content for
+            every image item.
+
+    Returns:
+        A ``(messages, images)`` tuple. ``messages`` is a list of
+        ``{"role", "content"}`` dicts in document order (role defaults to
+        "user"); ``images`` is the list of RGB PIL images, in the same order
+        as their placeholder messages.
+    """
     messages = []
     images = []
     for item in document:
         itype = item.get("type")
         if itype == "text":
             content = item.get("content")
             if content:
+                messages.append({"role": item.get("role", "user"), "content": content})
         elif itype == "image":
+            source = item.get("content")
+            if source is not None:
+                # convert() returns an independent, fully loaded image, so the
+                # handle opened by PILImage.open() can be released right away
+                # instead of lingering until garbage collection.
+                with PILImage.open(source) as opened:
+                    img = opened.convert("RGB")
                 images.append(img)
+                messages.append({"role": item.get("role", "user"), "content": vision_token})
     return messages, images
 def decode_tensor_stream(tensor_stream, tokenizer):
+    """Decode the non-image tokens of ``tensor_stream`` into text.
+
+    The token view is filtered with the modality mask so that positions
+    tagged ``VisionType.image`` are dropped, then handed to
+    ``tokenizer.decode``. If the filtered result has more than one
+    dimension, only its first row is decoded.
+    """
+    tokens = tensor_stream_token_view(tensor_stream)
+    modalities = modality_mask(tensor_stream)
+    non_image = tokens[(modalities != VisionType.image.value)]
+    if len(non_image.shape) > 1:
+        non_image = non_image[0]
+    return tokenizer.decode(non_image)
+def visualize_predictions(generated_text, image, output_path="prediction.jpeg"):
+    """Draw the boxes parsed from ``generated_text`` onto ``image`` and save the result.
+
+    Args:
+        generated_text: Model output that ``extract_points`` is asked to parse
+            for "box" entries.
+        image: PIL image the boxes refer to. It is never modified in place.
+        output_path: Where the rendered image is written.
+
+    Returns:
+        ``output_path``. When no boxes are found the input image is saved
+        unchanged; otherwise an annotated RGB copy is saved as JPEG.
+    """
     boxes = extract_points(generated_text, expected="box")
     if not boxes:
+        logger.info("No bounding boxes found in the generated text")
         image.save(output_path)
         return output_path
     img_width, img_height = image.size
+    # convert() always returns a new image, so this doubles as the working
+    # copy, and it guarantees a mode the JPEG writer accepts (no alpha/palette).
+    img_with_boxes = image.convert("RGB")
     draw = ImageDraw.Draw(img_with_boxes)
     try:
         font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
+    except OSError:
+        # The font file is missing or unreadable on this host; use Pillow's built-in font.
         font = ImageFont.load_default()
     colors = ["red", "green", "blue", "yellow", "magenta", "cyan", "orange", "purple"]
     for idx, box in enumerate(boxes):
         color = colors[idx % len(colors)]
         norm_x1, norm_y1 = box.top_left.x, box.top_left.y
         norm_x2, norm_y2 = box.bottom_right.x, box.bottom_right.y
+        # Box coordinates are on a 0-1000 scale; map them to pixel positions.
         x1 = int((norm_x1 / 1000.0) * img_width)
         y1 = int((norm_y1 / 1000.0) * img_height)
         x2 = int((norm_x2 / 1000.0) * img_width)
         y2 = int((norm_y2 / 1000.0) * img_height)
+        # Keep every corner inside the image.
         x1 = max(0, min(x1, img_width - 1))
         y1 = max(0, min(y1, img_height - 1))
         x2 = max(0, min(x2, img_width - 1))
         y2 = max(0, min(y2, img_height - 1))
+        # ImageDraw.rectangle requires x1 >= x0 and y1 >= y0, so order the
+        # corners in case the model emitted them swapped.
+        x1, x2 = sorted((x1, x2))
+        y1, y2 = sorted((y1, y2))
         draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
         if box.mention:
+            # Put the label just above the box, but never closer than 5px to the top edge.
             text_y = max(y1 - 20, 5)
             text_bbox = draw.textbbox((x1, text_y), box.mention, font=font)
             draw.rectangle(text_bbox, fill=color)
             draw.text((x1, text_y), box.mention, fill="white", font=font)
     img_with_boxes.save(output_path, "JPEG")
     return output_path
+@spaces.GPU(duration=120)
+def generate_response(image, prompt):
+    """Run the model on one image/prompt pair and render its predicted boxes.
+
+    Args:
+        image: Image source handed to ``document_to_messages`` (the UI passes
+            a file path). ``None`` means no image is attached.
+        prompt: User text appended after the image.
+
+    Returns:
+        ``(generated_text, visualization_path)``. ``generated_text`` is the
+        decoded output of ``model.generate`` with special tokens kept;
+        ``visualization_path`` is the annotated image file, or ``None`` when
+        the document contained no image.
+    """
+    import tempfile  # function-scope import: only this handler needs it
+    document = [
+        {"type": "text", "content": "<hint>BOX</hint>", "role": "user"},
+        {"type": "image", "content": image, "role": "user"},
+        {"type": "text", "content": prompt, "role": "user"},
+    ]
+    messages, images = document_to_messages(document, vision_token=config.vision_token)
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor(text=text, images=images, return_tensors="pt")
+    tensor_stream = inputs["tensor_stream"].to(device)
+    # Greedy decoding (do_sample=False), capped at 256 new tokens.
+    with torch.no_grad():
+        generated_ids = model.generate(
+            tensor_stream=tensor_stream,
+            max_new_tokens=256,
+            do_sample=False,
+            pad_token_id=processor.tokenizer.eos_token_id,
+            eos_token_id=processor.tokenizer.eos_token_id,
+        )
+    generated_text = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=False)
+    if not images:
+        return generated_text, None
+    # Give every request its own output file so concurrent requests cannot
+    # overwrite each other's visualization. delete=False keeps the file on
+    # disk after the handle is closed so it can still be served afterwards.
+    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
+        vis_path = tmp_file.name
+    return generated_text, visualize_predictions(generated_text, images[0], vis_path)
+# Example images and prompts
+# Each row is [image path, prompt], matching inputs=[image_input, prompt_input] below.
+# NOTE(review): "example.webp" is a relative path - presumably a file shipped
+# alongside app.py in the Space repository; verify it exists.
+examples = [
+    ["example.webp", "Determine whether it is safe to cross the street. Look for signage and moving traffic."],
+]
+# Build the UI: inputs in the left column, results in the right column.
+with gr.Blocks(title="Perceptron Isaac Vision Model", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🔍 Perceptron Isaac Vision Model")
+    gr.Markdown("Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)")
     gr.Markdown("""
+    This demo showcases the Perceptron Isaac-0.1 model for multimodal understanding with bounding box visualization.
+    Upload an image and provide a prompt to analyze the image and see detected objects with bounding boxes.
     """)
     with gr.Row():
+        with gr.Column(scale=1):
+            # type="filepath": the value reaches generate_response as its
+            # `image` argument, which is ultimately opened with PILImage.open.
             image_input = gr.Image(
+                type="filepath",
+                label="Upload Image",
+                sources=["upload", "webcam", "clipboard"],
+                height=400
             )
+            prompt_input = gr.Textbox(
+                label="Prompt",
+                value="Determine whether it is safe to cross the street. Look for signage and moving traffic.",
+                lines=3,
+                placeholder="Enter your prompt here..."
             )
+            generate_btn = gr.Button("🚀 Generate Response", variant="primary", size="lg")
+        with gr.Column(scale=1):
+            visualized_image = gr.Image(
+                label="Visualized Predictions (with Bounding Boxes)",
+                height=400
             )
+            generated_text = gr.Textbox(
+                label="Generated Text",
+                lines=10,
+                max_lines=20
             )
+    # Output order mirrors generate_response's return value: (text, image path).
+    # cache_examples=False: example outputs are not precomputed.
     gr.Examples(
+        examples=examples,
+        inputs=[image_input, prompt_input],
+        outputs=[generated_text, visualized_image],
         fn=generate_response,
+        cache_examples=False
+    )
+    # Clicking the button runs the same handler with the current inputs.
+    generate_btn.click(
+        generate_response,
+        inputs=[image_input, prompt_input],
+        outputs=[generated_text, visualized_image]
     )
+# Start the server only when the file is executed as a script.
 if __name__ == "__main__":
+    demo.launch()