import os
import sys

import torch
from PIL import Image as PILImage
from PIL import ImageDraw, ImageFont
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor
from loguru import logger
import gradio as gr
import spaces

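# The perceptron package provides the TensorStream helpers and the pointing
# parser used below; fail loudly if it is missing from the Space environment.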
try:
    from perceptron.tensorstream import VisionType
    from perceptron.tensorstream.ops import tensor_stream_token_view, modality_mask
    from perceptron.pointing.parser import extract_points
except ImportError:
    logger.error("perceptron package not found. Please ensure it's installed in your Hugging Face Space.")
    raise

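# Load the Isaac-0.1 processor, tokenizer, and model from the Hugging Face Hub.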
hf_path = "PerceptronAI/Isaac-0.1"
logger.info(f"Loading processor and config from HF checkpoint: {hf_path}")
config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(hf_path, trust_remote_code=True, use_fast=False)
processor = AutoProcessor.from_pretrained(hf_path, trust_remote_code=True)
processor.tokenizer = tokenizer

logger.info(f"Loading AutoModelForCausalLM from HF checkpoint: {hf_path}")
model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
model = model.to(device=device, dtype=dtype)
model.eval()

logger.info(f"Model loaded on {device} with dtype {dtype}")

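# Convert a list of {"type", "content", "role"} items into chat messages plus the
# PIL images they reference; each image entry becomes a vision-token placeholder.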
def document_to_messages(document, vision_token="<image>"):
    messages = []
    images = []
    for item in document:
        itype = item.get("type")
        if itype == "text":
            content = item.get("content")
            if content:
                messages.append({"role": item.get("role", "user"), "content": content})
        elif itype == "image":
            if "content" in item and item["content"] is not None:
                img = PILImage.open(item["content"]).convert("RGB")
                images.append(img)
                messages.append({"role": item.get("role", "user"), "content": vision_token})
    return messages, images

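# Decode only the text tokens of a TensorStream, masking out image positions.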
def decode_tensor_stream(tensor_stream, tokenizer):
    token_view = tensor_stream_token_view(tensor_stream)
    mod = modality_mask(tensor_stream)
    text_tokens = token_view[(mod != VisionType.image.value)]
    decoded = tokenizer.decode(text_tokens[0] if len(text_tokens.shape) > 1 else text_tokens)
    return decoded

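# Parse bounding boxes from the generated text and draw them (with their labels)
# on a copy of the input image; returns the path of the saved visualization.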
def visualize_predictions(generated_text, image, output_path="prediction.jpeg"):
    boxes = extract_points(generated_text, expected="box")
    if not boxes:
        logger.info("No bounding boxes found in the generated text")
        image.save(output_path)
        return output_path

    img_width, img_height = image.size
    img_with_boxes = image.copy()
    draw = ImageDraw.Draw(img_with_boxes)

    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
    except OSError:
        font = ImageFont.load_default()

    colors = ["red", "green", "blue", "yellow", "magenta", "cyan", "orange", "purple"]

    for idx, box in enumerate(boxes):
        color = colors[idx % len(colors)]
        # Box coordinates are normalized to a 0-1000 grid; scale them to pixels.
        norm_x1, norm_y1 = box.top_left.x, box.top_left.y
        norm_x2, norm_y2 = box.bottom_right.x, box.bottom_right.y
        x1 = int((norm_x1 / 1000.0) * img_width)
        y1 = int((norm_y1 / 1000.0) * img_height)
        x2 = int((norm_x2 / 1000.0) * img_width)
        y2 = int((norm_y2 / 1000.0) * img_height)

        # Clamp coordinates to the image bounds.
        x1 = max(0, min(x1, img_width - 1))
        y1 = max(0, min(y1, img_height - 1))
        x2 = max(0, min(x2, img_width - 1))
        y2 = max(0, min(y2, img_height - 1))

        draw.rectangle([x1, y1, x2, y2], outline=color, width=3)

        # Draw the mention label just above the box on a filled background.
        if box.mention:
            text_y = max(y1 - 20, 5)
            text_bbox = draw.textbbox((x1, text_y), box.mention, font=font)
            draw.rectangle(text_bbox, fill=color)
            draw.text((x1, text_y), box.mention, fill="white", font=font)

    img_with_boxes.save(output_path, "JPEG")
    return output_path

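# Spaces GPU entry point: build the multimodal request, run generation, and
# return the raw generated text together with a box-annotated visualization.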
@spaces.GPU(duration=120)
def generate_response(image, prompt):
    if image is None:
        raise gr.Error("Please upload an image before generating a response.")

    # The <hint>BOX</hint> prefix asks the model to emit bounding-box annotations.
    document = [
        {"type": "text", "content": "<hint>BOX</hint>", "role": "user"},
        {"type": "image", "content": image, "role": "user"},
        {"type": "text", "content": prompt, "role": "user"},
    ]

    messages, images = document_to_messages(document, vision_token=config.vision_token)
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=text, images=images, return_tensors="pt")
    tensor_stream = inputs["tensor_stream"].to(device)
    input_ids = inputs["input_ids"].to(device)

    # Log the text-only view of the stream for debugging.
    decoded_content = decode_tensor_stream(tensor_stream, processor.tokenizer)
    logger.info(f"Decoded prompt: {decoded_content}")

    with torch.no_grad():
        generated_ids = model.generate(
            tensor_stream=tensor_stream,
            max_new_tokens=256,
            do_sample=False,
            pad_token_id=processor.tokenizer.eos_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
        )

    generated_text = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=False)

    if images:
        vis_path = visualize_predictions(generated_text, images[0])
        return generated_text, vis_path
    else:
        return generated_text, None

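# Example image/prompt pair surfaced in the Gradio UI.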
examples = [
    ["example.webp", "Determine whether it is safe to cross the street. Look for signage and moving traffic."],
]

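# Gradio UI: image and prompt inputs on the left, the annotated image and the
# model's raw output on the right.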
with gr.Blocks(title="Perceptron Isaac Vision Model", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π Perceptron Isaac Vision Model")
    gr.Markdown("Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)")
    gr.Markdown("""
    This demo showcases the Perceptron Isaac-0.1 model for multimodal understanding with bounding box visualization.
    Upload an image and provide a prompt to analyze the image and see detected objects with bounding boxes.
    """)

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(
                type="filepath",
                label="Upload Image",
                sources=["upload", "webcam", "clipboard"],
                height=400,
            )
            prompt_input = gr.Textbox(
                label="Prompt",
                value="Determine whether it is safe to cross the street. Look for signage and moving traffic.",
                lines=3,
                placeholder="Enter your prompt here...",
            )
            generate_btn = gr.Button("π Generate Response", variant="primary", size="lg")

        with gr.Column(scale=1):
            visualized_image = gr.Image(
                label="Visualized Predictions (with Bounding Boxes)",
                height=400,
            )
            generated_text = gr.Textbox(
                label="Generated Text",
                lines=10,
                max_lines=20,
            )

    gr.Examples(
        examples=examples,
        inputs=[image_input, prompt_input],
        outputs=[generated_text, visualized_image],
        fn=generate_response,
        cache_examples=False,
    )

    generate_btn.click(
        generate_response,
        inputs=[image_input, prompt_input],
        outputs=[generated_text, visualized_image],
    )

if __name__ == "__main__":
    demo.launch()