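# Gradio chat demo for Qwen/Qwen3-VL-8B-Instruct, built to run on a Hugging Face
# Space with ZeroGPU hardware ("Running on Zero").
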
import spaces
import gradio as gr
import torch
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
from PIL import Image
from typing import List, Dict, Any, Optional, Tuple

# Initialize the model and processor
model_id = "Qwen/Qwen3-VL-8B-Instruct"

# Load the model with optimizations for inference
model = Qwen3VLForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_id)
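
# Note: bfloat16 halves weight memory relative to float32, and device_map="auto"
# (which relies on the accelerate package) places the weights on whatever device
# is available; on a ZeroGPU Space the GPU is attached only while a function
# decorated with @spaces.GPU is running.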

@spaces.GPU  # Required on ZeroGPU Spaces so this function runs with a GPU attached
def process_chat_message(
    message: str,
    image: Optional[Image.Image],
    history: List[Dict[str, Any]]
) -> str:
    """
    Process a chat message with optional image input using the Qwen3-VL model.

    Args:
        message: The user's text message
        image: Optional PIL Image
        history: Chat history

    Returns:
        The model's response
    """
    # Prepare the message content
    content = []

    # Add image if provided
    if image is not None:
        # Pass the PIL image through in the structured content format
        content.append({"type": "image", "image": image})

    # Add text message
    if message:
        content.append({"type": "text", "text": message})

    # Create the messages format for the model
    messages = []

    # Add history if it exists (text only, for simplicity)
    for hist_item in history:
        if hist_item["role"] == "user":
            messages.append({
                "role": "user",
                "content": hist_item.get("content", "")
            })
        elif hist_item["role"] == "assistant":
            messages.append({
                "role": "assistant",
                "content": hist_item.get("content", "")
            })

    # Add the current message
    if content:
        messages.append({
            "role": "user",
            "content": content
        })

    # Prepare inputs for the model
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    )

    # Move inputs to the same device as the model
    inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}

    # Generate the response
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            do_sample=True,
            top_p=0.95
        )

    # Decode only the newly generated tokens (strip the prompt)
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    return response

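# Illustrative call (the file name is hypothetical, not part of the app):
#   reply = process_chat_message("What is in this photo?", Image.open("photo.jpg"), [])
# History entries use the same {"role": ..., "content": ...} dicts as the chatbot below.
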
def chat_fn(message: Dict[str, Any], history: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Main chat function that processes user input and returns the response.

    Args:
        message: Dictionary containing text and optional files
        history: Chat history in messages format

    Returns:
        An empty string (to clear the textbox) and the updated history
    """
    text = message.get("text", "")
    files = message.get("files", [])

    # Process the image if one was provided
    image = None
    if files:
        try:
            image = Image.open(files[0])
            # Flatten RGBA onto a white background, using the alpha channel as the mask
            if image.mode == "RGBA":
                background = Image.new("RGB", image.size, (255, 255, 255))
                background.paste(image, mask=image.split()[3])
                image = background
        except Exception as e:
            print(f"Error loading image: {e}")
            image = None

    # Convert history to the format expected by the model
    model_history = []
    for msg in history:
        if msg.get("role") == "user":
            model_history.append({"role": "user", "content": msg.get("content", "")})
        elif msg.get("role") == "assistant":
            model_history.append({"role": "assistant", "content": msg.get("content", "")})

    # Get the response from the model
    try:
        response = process_chat_message(text, image, model_history)
    except Exception as e:
        response = f"Sorry, I encountered an error: {str(e)}"

    # Update history with the proper message format
    if image is not None:
        # Include an image indicator in the content
        user_content = f"{text}\n[Image uploaded]" if text else "[Image uploaded]"
    else:
        user_content = text

    history.append({"role": "user", "content": user_content})
    history.append({"role": "assistant", "content": response})

    return "", history

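# For reference, gr.MultimodalTextbox submits a dict shaped like
#   {"text": "What is this?", "files": ["/tmp/gradio/.../image.png"]}
# (the path is illustrative), which is why chat_fn reads the "text" and "files" keys.
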
def retry_fn(history: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]]]:
    """Retry the last message."""
    if not history or len(history) < 2:
        return "", history

    # Remove the last assistant response
    history = history[:-1]

    # Get the last user message
    last_user_msg = history[-1] if history else None
    if not last_user_msg:
        return "", history

    # Remove the last user message too (we'll re-add it with the new response)
    history = history[:-1]

    # Recreate the message dict
    user_content = last_user_msg.get("content", "")

    # Extract the text without the image indicator
    if "[Image uploaded]" in user_content:
        text = user_content.replace("\n[Image uploaded]", "").replace("[Image uploaded]", "")
    else:
        text = user_content

    message = {"text": text}
    return chat_fn(message, history)

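# Note: history stores only the "[Image uploaded]" text marker, not the image itself,
# so a retried turn is re-sent to the model without the original image.
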
def undo_fn(history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Undo the last message pair (user + assistant)."""
    if history and len(history) >= 2:
        # Remove the last user and assistant messages
        return history[:-2]
    return []

def clear_fn() -> Tuple[str, List]:
    """Clear the chat."""
    return "", []

# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
    gr.Markdown(
        """
        # Qwen3-VL Multimodal Chat

        Chat with Qwen3-VL - a powerful vision-language model that can understand and discuss images!

        **Features:**
        - Text conversations
        - Image understanding and analysis
        - Visual question answering
        - Detailed image descriptions

        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                """
                ### Tips:
                - Upload an image and ask questions about it
                - Try asking for detailed descriptions
                - Ask about objects, colors, or text in images
                - Compare elements within the image
                """
            )
            gr.Markdown(
                """
                ### Example Prompts:
                - "What's in this image?"
                - "Describe this scene in detail"
                - "What text can you see?"
                - "Count the objects in the image"
                - "What's the mood of this image?"
                """
            )
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(
                label="Chat",
                type="messages",
                height=500,
                show_copy_button=True,
                bubble_full_width=False,
                value=[]
            )
            with gr.Row():
                msg = gr.MultimodalTextbox(
                    label="Message",
                    placeholder="Type a message or upload an image...",
                    file_types=["image"],
                    submit_btn=True,
                    stop_btn=False
                )

            with gr.Row():
                retry_btn = gr.Button("Retry", variant="secondary", size="sm")
                undo_btn = gr.Button("Undo", variant="secondary", size="sm")
                clear_btn = gr.Button("Clear", variant="secondary", size="sm")
    with gr.Accordion("Advanced Settings", open=False):
        gr.Markdown(
            """
            **Model Information:**
            - Model: Qwen3-VL-8B-Instruct
            - Optimized for vision-language tasks
            - Supports multiple languages
            - Best performance with clear, well-lit images
            """
        )
    # Set up event handlers
    msg.submit(
        chat_fn,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=True
    )

    retry_btn.click(
        retry_fn,
        inputs=[chatbot],
        outputs=[msg, chatbot],
        queue=True
    )

    undo_btn.click(
        undo_fn,
        inputs=[chatbot],
        outputs=[chatbot],
        queue=False
    )

    clear_btn.click(
        clear_fn,
        outputs=[msg, chatbot],
        queue=False
    )
    # Add examples
    gr.Examples(
        examples=[
            {"text": "Hello! What can you help me with today?"},
            {"text": "Can you describe an image if I upload one?"},
            {"text": "What are your capabilities?"},
        ],
        inputs=msg,
        label="Example Messages"
    )

if __name__ == "__main__":
    demo.launch(
        show_error=True,
        share=False,
        debug=True
    )