import gradio as gr
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image
import spaces
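
# `spaces` is the Hugging Face Spaces helper package; it provides the
# @spaces.GPU decorator used for ZeroGPU allocation below.
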
# Load model and processor
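# bfloat16 halves the weight memory vs. float32, and device_map="auto" lets
# accelerate place the model on whatever accelerator is available.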
model = Qwen3VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen3-VL-2B-Instruct",
torch_dtype=torch.bfloat16,
device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-2B-Instruct")
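
# On ZeroGPU Spaces, @spaces.GPU attaches a GPU to each call for up to
# `duration` seconds; outside ZeroGPU the decorator is a pass-through.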
@spaces.GPU(duration=120)
def qwen_chat_fn(message, history):
"""
Process chat messages with multimodal support
Args:
message (dict): Contains 'text' and 'files' keys
history (list): Chat history in messages format
Returns:
str: Assistant response
"""
# Extract text and files from the message
text = message.get("text", "")
files = message.get("files", [])
    # Build messages list for the model
    messages = []

    # Replay previous turns. With a multimodal ChatInterface, Gradio can
    # store file messages whose content is not a plain string; skip those
    # so only text turns are replayed.
    for hist_item in history:
        if hist_item["role"] in ("user", "assistant") and isinstance(
            hist_item["content"], str
        ):
            messages.append({
                "role": hist_item["role"],
                "content": [{"type": "text", "text": hist_item["content"]}],
            })
    # Build current message content
    current_content = []

    # Add images if provided
    if files:
        for file_path in files:
            try:
                # convert("RGB") reads the file eagerly and normalizes
                # palette/RGBA images before preprocessing
                image = Image.open(file_path).convert("RGB")
                current_content.append({
                    "type": "image",
                    "image": image,
                })
            except Exception as e:
                print(f"Error loading image {file_path}: {e}")
    # Add text
    if text:
        current_content.append({
            "type": "text",
            "text": text,
        })

    # If no content, return empty
    if not current_content:
        return ""

    # Add current message
    messages.append({
        "role": "user",
        "content": current_content,
    })
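
    # `messages` now matches the chat-template schema the Qwen processor
    # expects: {"role", "content"} dicts whose content is a list of typed
    # parts ({"type": "image"} / {"type": "text"}).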
    # Prepare inputs for the model; with tokenize=True the processor runs
    # both the tokenizer and the image preprocessor in one call
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)
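
    # Sampling (temperature/top_p) trades determinism for variety; set
    # do_sample=False for reproducible replies.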
    # Generate response
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4000,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
        )
    # Decode only the newly generated tokens: generate() returns the prompt
    # followed by the completion, so slice off the prompt portion first
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return output_text
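
# Smoke test outside the UI (hypothetical image path); ChatInterface passes
# the same {"text", "files"} dict shape:
#   print(qwen_chat_fn({"text": "Describe this image.", "files": ["/tmp/cat.png"]}, []))
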
# Create the ChatInterface
demo = gr.ChatInterface(
    fn=qwen_chat_fn,
    type="messages",
    multimodal=True,
    title="🎨 Qwen3-VL Multimodal Chat",
    description="""
Chat with Qwen3-VL-2B-Instruct, a multimodal model that understands both text and images.

**Features:**
- πŸ“ Text conversations
- πŸ–ΌοΈ Image understanding and analysis
- 🎯 Visual question answering
- πŸ” Detailed image descriptions

**How to use:**
- Type your message in the text box
- Click the attachment button to upload images
- Send your message to get a response

[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
""",
    examples=[
        {"text": "Hello! What can you help me with today?", "files": []},
        {"text": "Can you explain what machine learning is?", "files": []},
        {"text": "What are the key elements of good design?", "files": []},
    ],
    theme=gr.themes.Soft(),
    autofocus=True,
    submit_btn="Send",
    stop_btn="Stop",
    cache_examples=False,
    analytics_enabled=False,
)
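
# gr.ChatInterface subclasses gr.Blocks, so re-entering its context lets us
# append extra components below the chat area.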
# Add additional information in a Markdown block
with demo:
    gr.Markdown(
        """
---
### πŸ’‘ Tips for Best Results:
- **For images:** Upload clear, well-lit images for better analysis
- **For questions:** Be specific about what you want to know
- **Context matters:** Provide relevant context for more accurate responses
- **Multiple images:** You can upload multiple images in a single message

### πŸš€ Model Information:
- **Model:** Qwen3-VL-2B-Instruct
- **Parameters:** 2 billion
- **Capabilities:** Image understanding, OCR, visual reasoning, general conversation
- **Powered by:** Hugging Face Spaces with ZeroGPU
"""
    )
if __name__ == "__main__":
    demo.launch(share=False)