# Qwen3-VL multimodal chat demo (Hugging Face Space, running on ZeroGPU)
import spaces
import gradio as gr
import torch
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
from PIL import Image
from typing import List, Dict, Any, Optional, Tuple
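
# NOTE: Qwen3-VL support requires a recent transformers release; if the
# Qwen3VLForConditionalGeneration import fails, upgrade transformers first.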
# Initialize the model and processor
model_id = "Qwen/Qwen3-VL-8B-Instruct"

# Load the model in bfloat16 to halve memory use versus fp32;
# device_map="auto" places the weights on the available GPU
model = Qwen3VLForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)
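
# On ZeroGPU Spaces ("Running on Zero"), the `spaces.GPU` decorator below
# requests a GPU only for the duration of the decorated call (capped here at
# 60 seconds); the rest of the app runs on CPU.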
@spaces.GPU(duration=60)
def process_chat_message(
    message: str,
    image: Optional[Image.Image],
    history: List[Dict[str, Any]],
) -> str:
    """
    Process a chat message with optional image input using the Qwen3-VL model.

    Args:
        message: The user's text message.
        image: Optional PIL image.
        history: Chat history in messages format.

    Returns:
        The model's response text.
    """
    # Build the multimodal content for the current turn
    content = []

    # Add the image if provided
    if image is not None:
        content.append({"type": "image", "image": image})

    # Add the text message
    if message:
        content.append({"type": "text", "text": message})

    # Assemble the messages list expected by the chat template
    messages = []

    # Replay prior turns (text only, for simplicity)
    for hist_item in history:
        if hist_item["role"] == "user":
            messages.append({"role": "user", "content": hist_item.get("content", "")})
        elif hist_item["role"] == "assistant":
            messages.append({"role": "assistant", "content": hist_item.get("content", "")})

    # Append the current user message
    if content:
        messages.append({"role": "user", "content": content})
    # Tokenize the text and preprocess any images in a single call
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )

    # Move tensor inputs to the same device as the model
    inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
    # Generate the response
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            do_sample=True,
            top_p=0.95,
        )

    # Strip the prompt tokens so only the newly generated text is decoded
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    return response
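
# For reference, the `messages` structure built above looks roughly like this
# (an illustrative sketch; <PIL.Image.Image> stands in for a real image object):
#
#   messages = [
#       {"role": "user", "content": "Hello!"},
#       {"role": "assistant", "content": "Hi! How can I help?"},
#       {"role": "user", "content": [
#           {"type": "image", "image": <PIL.Image.Image>},
#           {"type": "text", "text": "What's in this picture?"},
#       ]},
#   ]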

def chat_fn(message: Dict[str, Any], history: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Main chat function that processes user input and returns the response.

    Args:
        message: Dictionary containing text and optional files.
        history: Chat history in messages format.

    Returns:
        An empty string (to clear the input box) and the updated history.
    """
    text = message.get("text", "")
    files = message.get("files", [])

    # Load the image if one was uploaded
    image = None
    if files:
        try:
            image = Image.open(files[0])
            # Flatten RGBA onto a white background; the model expects RGB
            if image.mode == "RGBA":
                background = Image.new("RGB", image.size, (255, 255, 255))
                background.paste(image, mask=image.split()[3])
                image = background
        except Exception as e:
            print(f"Error loading image: {e}")
            image = None

    # Convert the history to the format expected by the model
    model_history = []
    for msg in history:
        if msg.get("role") == "user":
            model_history.append({"role": "user", "content": msg.get("content", "")})
        elif msg.get("role") == "assistant":
            model_history.append({"role": "assistant", "content": msg.get("content", "")})

    # Get the response from the model
    try:
        response = process_chat_message(text, image, model_history)
    except Exception as e:
        response = f"Sorry, I encountered an error: {e}"

    # Record the turn, marking image uploads in the stored text
    if image is not None:
        user_content = f"{text}\n[Image uploaded]" if text else "[Image uploaded]"
    else:
        user_content = text

    history.append({"role": "user", "content": user_content})
    history.append({"role": "assistant", "content": response})

    return "", history
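
# Gradio's "messages" history (chatbot type="messages") is a flat list of
# role/content dicts, e.g.:
#   [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]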

def retry_fn(history: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]]]:
    """Retry the last exchange (text only; uploaded images are not re-sent)."""
    if not history or len(history) < 2:
        return "", history

    # Drop the last assistant response
    history = history[:-1]

    # Get the last user message
    last_user_msg = history[-1] if history else None
    if not last_user_msg:
        return "", history

    # Drop the last user message too; chat_fn will re-add it with a new response
    history = history[:-1]

    # Recover the original text, stripping the image-upload marker if present
    user_content = last_user_msg.get("content", "")
    text = user_content.replace("\n[Image uploaded]", "").replace("[Image uploaded]", "")

    message = {"text": text}
    return chat_fn(message, history)

def undo_fn(history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Undo the last message pair (user + assistant)."""
    if history and len(history) >= 2:
        return history[:-2]
    return []


def clear_fn() -> Tuple[str, List]:
    """Clear the chat."""
    return "", []

# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
    gr.Markdown(
        """
        # 🚀 Qwen3-VL Multimodal Chat

        Chat with Qwen3-VL, a powerful vision-language model that can understand and discuss images!

        **Features:**
        - 💬 Text conversations
        - 🖼️ Image understanding and analysis
        - 🎨 Visual question answering
        - 📝 Detailed image descriptions

        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                """
                ### 💡 Tips:
                - Upload an image and ask questions about it
                - Try asking for detailed descriptions
                - Ask about objects, colors, or text in images
                - Compare elements within the image
                """
            )
            gr.Markdown(
                """
                ### 📸 Example Prompts:
                - "What's in this image?"
                - "Describe this scene in detail"
                - "What text can you see?"
                - "Count the objects in the image"
                - "What's the mood of this image?"
                """
            )
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(
                label="Chat",
                type="messages",
                height=500,
                show_copy_button=True,
                bubble_full_width=False,
                avatar_images=[None, "🤖"],
                value=[],
            )

            with gr.Row():
                msg = gr.MultimodalTextbox(
                    label="Message",
                    placeholder="Type a message or upload an image...",
                    file_types=["image"],
                    submit_btn=True,
                    stop_btn=False,
                )

            with gr.Row():
                retry_btn = gr.Button("🔄 Retry", variant="secondary", size="sm")
                undo_btn = gr.Button("↩️ Undo", variant="secondary", size="sm")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="sm")
    with gr.Accordion("⚙️ Advanced Settings", open=False):
        gr.Markdown(
            """
            **Model Information:**
            - Model: Qwen3-VL-8B-Instruct
            - Optimized for vision-language tasks
            - Supports multiple languages
            - Best performance with clear, well-lit images
            """
        )
    # Set up event handlers
    msg.submit(
        chat_fn,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=True,
    )

    retry_btn.click(
        retry_fn,
        inputs=[chatbot],
        outputs=[msg, chatbot],
        queue=True,
    )

    undo_btn.click(
        undo_fn,
        inputs=[chatbot],
        outputs=[chatbot],
        queue=False,
    )

    clear_btn.click(
        clear_fn,
        outputs=[msg, chatbot],
        queue=False,
    )

    # Add examples
    gr.Examples(
        examples=[
            {"text": "Hello! What can you help me with today?"},
            {"text": "Can you describe an image if I upload one?"},
            {"text": "What are your capabilities?"},
        ],
        inputs=msg,
        label="Example Messages",
    )

if __name__ == "__main__":
    demo.launch(
        show_error=True,
        share=False,
        debug=True,
    )
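
# A minimal smoke test of the inference path, bypassing the UI (assumes a GPU
# allocation is available; "example.jpg" is a hypothetical local file):
#
#   from PIL import Image
#   img = Image.open("example.jpg")
#   print(process_chat_message("What's in this image?", img, []))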