Spaces: Running on Zero · File size: 5,210 Bytes
import gradio as gr
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image
import spaces
# Load model and processor
model = Qwen3VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen3-VL-2B-Instruct",
torch_dtype=torch.bfloat16,
device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-2B-Instruct")
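# On ZeroGPU Spaces the model/processor are loaded once at startup, while GPU time
# is allocated per call inside functions decorated with @spaces.GPU (used below).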
@spaces.GPU(duration=120)
def qwen_chat_fn(message, history):
"""
Process chat messages with multimodal support
Args:
message (dict): Contains 'text' and 'files' keys
history (list): Chat history in messages format
Returns:
str: Assistant response
"""
# Extract text and files from the message
text = message.get("text", "")
files = message.get("files", [])
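    # With a multimodal ChatInterface, `message` arrives as a dict, e.g.
    # {"text": "What is in this photo?", "files": ["/tmp/gradio/photo.png"]}
    # (the path above is purely illustrative).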
    # Build messages list for the model
    messages = []

    # Add previous chat history
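    # (gradio "messages"-style history assumed here, e.g.
    #  [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}])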
    for hist_item in history:
        # History entries created from uploaded files can carry non-string
        # content (e.g. file tuples); only plain-text turns are replayed here.
        content = hist_item.get("content", "")
        if not isinstance(content, str):
            continue
        if hist_item["role"] == "user":
            messages.append({
                "role": "user",
                "content": [{"type": "text", "text": content}]
            })
        elif hist_item["role"] == "assistant":
            messages.append({
                "role": "assistant",
                "content": [{"type": "text", "text": content}]
            })
    # Build current message content
    current_content = []

    # Add images if provided
    if files:
        for file_path in files:
            try:
                image = Image.open(file_path)
                current_content.append({
                    "type": "image",
                    "image": image
                })
            except Exception as e:
                print(f"Error loading image {file_path}: {e}")
    # Add text
    if text:
        current_content.append({
            "type": "text",
            "text": text
        })

    # If no content, return empty
    if not current_content:
        return ""

    # Add current message
    messages.append({
        "role": "user",
        "content": current_content
    })
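    # At this point `messages` has roughly this shape (values illustrative):
    # [
    #     {"role": "user", "content": [{"type": "text", "text": "earlier question"}]},
    #     {"role": "assistant", "content": [{"type": "text", "text": "earlier answer"}]},
    #     {"role": "user", "content": [{"type": "image", "image": <PIL.Image>},
    #                                  {"type": "text", "text": "current question"}]},
    # ]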
    # Prepare inputs for the model
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    )
    inputs = inputs.to(model.device)
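    # `inputs` is a dict-like batch; for Qwen VL processors it typically contains
    # input_ids and attention_mask plus, when images are present, pixel_values and
    # image grid metadata (exact keys depend on the installed transformers version).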
    # Generate response
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4000,
            temperature=0.7,
            top_p=0.95,
            do_sample=True
        )
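    # The sampling settings above (temperature/top_p) are untuned demo defaults;
    # set do_sample=False for more deterministic answers.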
    # Decode output: generate() returns prompt + new tokens, so strip the prompt
    # portion of each sequence before decoding
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    return output_text
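# A possible extension (not part of the original app, untested sketch): streaming
# partial responses. ChatInterface treats a generator fn as a streaming handler,
# so qwen_chat_fn could yield growing text produced via TextIteratorStreamer, e.g.:
#
#   from threading import Thread
#   from transformers import TextIteratorStreamer
#
#   streamer = TextIteratorStreamer(
#       processor.tokenizer, skip_prompt=True, skip_special_tokens=True
#   )
#   Thread(target=model.generate, kwargs={**inputs, "streamer": streamer,
#                                         "max_new_tokens": 4000}).start()
#   partial = ""
#   for chunk in streamer:
#       partial += chunk
#       yield partial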
# Example prompts kept for reference (not wired into the UI; the ChatInterface
# below passes its own `examples` list)
example_messages = [
    {"text": "Hello! Can you describe what makes a good photograph?", "files": []},
    {"text": "What's the weather like in this image?", "files": []},
    {"text": "Can you analyze the composition of this picture?", "files": []},
]
# Create the ChatInterface
demo = gr.ChatInterface(
    fn=qwen_chat_fn,
    type="messages",
    multimodal=True,
    title="🎨 Qwen3-VL Multimodal Chat",
    description="""
    Chat with Qwen3-VL-2B-Instruct - A powerful multimodal AI that understands both text and images!

    **Features:**
    - 💬 Text conversations
    - 🖼️ Image understanding and analysis
    - 🎯 Visual question answering
    - 📝 Detailed image descriptions

    **How to use:**
    - Type your message in the text box
    - Click the attachment button to upload images
    - Send your message to get AI responses

    [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
    """,
    examples=[
        {"text": "Hello! What can you help me with today?", "files": []},
        {"text": "Can you explain what machine learning is?", "files": []},
        {"text": "What are the key elements of good design?", "files": []},
    ],
    theme=gr.themes.Soft(),
    autofocus=True,
    submit_btn="Send",
    stop_btn="Stop",
    additional_inputs=None,
    additional_inputs_accordion=None,
    cache_examples=False,
    analytics_enabled=False,
)
# Add additional information in a Markdown block
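# (Re-entering `with demo:` appends new components to the existing ChatInterface
# Blocks, so this Markdown renders below the chat area.)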
with demo:
    gr.Markdown(
        """
        ---
        ### 💡 Tips for Best Results:
        - **For images:** Upload clear, well-lit images for better analysis
        - **For questions:** Be specific about what you want to know
        - **Context matters:** Provide relevant context for more accurate responses
        - **Multiple images:** You can upload multiple images in a single message

        ### 📊 Model Information:
        - **Model:** Qwen3-VL-2B-Instruct
        - **Parameters:** 2 Billion
        - **Capabilities:** Image understanding, OCR, visual reasoning, general conversation
        - **Powered by:** Hugging Face Spaces with ZeroGPU
        """
    )
if __name__ == "__main__":
    demo.launch(share=False)