# Qwen3-VL-8B-Instruct
#
# Running on Zero
#
# File size: 9,649 Bytes

import spaces
import gradio as gr
import torch
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
from PIL import Image
import numpy as np
from typing import List, Dict, Any, Optional, Tuple
import io
import base64

# Initialize the model and processor
# NOTE: everything in this section runs at import time, so importing this
# module triggers the checkpoint load.
# The same id is used below for both the model weights and the processor.
model_id = "Qwen/Qwen3-VL-8B-Instruct"  

# Load model with optimizations for inference
# Weights are requested in bfloat16, and device placement is delegated to the
# loader via device_map="auto".
model = Qwen3VLForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
# The processor is used later to apply the chat template to the conversation
# and to decode the generated token ids back into text.
processor = AutoProcessor.from_pretrained(model_id)

def _to_content_parts(content: Any) -> List[Dict[str, Any]]:
    """Return *content* in the list-of-parts form used for multimodal chat turns.

    A list is assumed to already be a list of content parts and is passed
    through unchanged; anything else is wrapped as a single text part.
    """
    if isinstance(content, list):
        return content
    text = "" if content is None else str(content)
    return [{"type": "text", "text": text}]


@spaces.GPU(duration=60)
def process_chat_message(
    message: str,
    image: Optional[Image.Image],
    history: List[Dict[str, Any]]
) -> str:
    """
    Process a chat message with optional image input using Qwen3-VL model.
    
    Args:
        message: The user's text message
        image: Optional PIL Image
        history: Chat history as a list of {"role", "content"} dicts. Only
            "user" and "assistant" entries are forwarded to the model.
    
    Returns:
        The model's response
    
    Raises:
        ValueError: If neither a text message nor an image is provided.
    """
    # Build the current user turn: image part first (if any), then the text.
    content = []
    if image is not None:
        content.append({"type": "image", "image": image})
    if message:
        content.append({"type": "text", "text": message})

    # Without a new user turn the conversation would be empty or end on an
    # assistant message, so there is nothing meaningful to generate from.
    if not content:
        raise ValueError("Please provide a text message or an image.")

    # Replay earlier turns. Every turn uses the same list-of-parts content
    # shape as the current one, so the processor never has to deal with a mix
    # of plain strings and part lists.
    messages = [
        {
            "role": hist_item["role"],
            "content": _to_content_parts(hist_item.get("content", ""))
        }
        for hist_item in (history or [])
        if hist_item.get("role") in ("user", "assistant")
    ]
    messages.append({"role": "user", "content": content})

    # Prepare inputs for the model
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    )

    # Move tensor inputs to the same device as the model; leave the rest as-is.
    inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}

    # Generate response (sampling enabled; no gradients needed for inference)
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            do_sample=True,
            top_p=0.95
        )

    # generate() returns prompt + completion; drop the prompt tokens so only
    # the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
    ]

    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    return response

def _load_rgb_image(path: Any) -> "Image.Image":
    """Open *path* with Pillow and return an RGB copy that no longer needs the file.

    Images that carry transparency (RGBA, LA, or a palette image with a
    transparency entry) are composited onto a white background; every other
    mode is converted to RGB directly.
    """
    # The context manager closes the underlying file; convert()/paste() below
    # produce images that own their pixel data.
    with Image.open(path) as source:
        has_alpha = source.mode in ("RGBA", "LA") or (
            source.mode == "P" and "transparency" in source.info
        )
        if not has_alpha:
            return source.convert("RGB")
        rgba = source.convert("RGBA")
        background = Image.new("RGB", rgba.size, (255, 255, 255))
        background.paste(rgba, mask=rgba.split()[3])
        return background


def chat_fn(message: Dict[str, Any], history: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Main chat function that processes user input and returns response.
    
    Args:
        message: Dictionary containing text and optional files. Only the
            first file is used, and it is treated as an image.
        history: Chat history in messages format. It is extended in place
            with the new user/assistant pair.
    
    Returns:
        Empty string and updated history. An empty submission (no text and
        no files) returns the history unchanged.
    """
    if history is None:
        history = []

    # Either key may be missing or explicitly None.
    text = message.get("text") or ""
    files = message.get("files") or []

    # Nothing to send: do not call the model or add an empty turn.
    if not text.strip() and not files:
        return "", history

    # What the user sees in the transcript for this turn.
    if files:
        user_content = f"{text}\n[Image uploaded]" if text else "[Image uploaded]"
    else:
        user_content = text

    # Process image if provided
    image = None
    if files:
        try:
            image = _load_rgb_image(files[0])
        except Exception as e:
            # Uploads are untrusted, so any decoding failure is handled here.
            # Answering without the image the user asked about would be
            # misleading, so report the problem instead of calling the model.
            print(f"Error loading image: {e}")
            history.append({"role": "user", "content": user_content})
            history.append({
                "role": "assistant",
                "content": f"Sorry, I couldn't read the uploaded image: {e}"
            })
            return "", history

    # Convert history to format expected by model (user/assistant turns only)
    model_history = [
        {"role": msg.get("role"), "content": msg.get("content", "")}
        for msg in history
        if msg.get("role") in ("user", "assistant")
    ]

    # Get response from model; surface failures in the chat instead of crashing the UI.
    try:
        response = process_chat_message(text, image, model_history)
    except Exception as e:
        print(f"Error generating response: {e}")
        response = f"Sorry, I encountered an error: {str(e)}"

    history.append({"role": "user", "content": user_content})
    history.append({"role": "assistant", "content": response})

    return "", history

def retry_fn(history: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Retry the last message.
    
    Drops the last user/assistant pair and re-submits the user's text through
    chat_fn. The uploaded image itself is not kept in the history, so a retry
    is always text-only.
    
    Args:
        history: Chat history in messages format
    
    Returns:
        Empty string and updated history. The history is returned unchanged
        when it does not end with a user message followed by an assistant
        reply, or when the last user turn has no text to re-send (for
        example an image-only message).
    """
    if not history or len(history) < 2:
        return "", history

    last_user_msg, last_reply = history[-2], history[-1]
    # Only a completed exchange (user then assistant) can be retried.
    if last_user_msg.get("role") != "user" or last_reply.get("role") != "assistant":
        return "", history

    user_content = last_user_msg.get("content", "")
    if not isinstance(user_content, str):
        # Non-text content cannot be turned back into a text message.
        return "", history

    # The image indicator is appended as a suffix ("<text>\n[Image uploaded]"
    # or just "[Image uploaded]"). Strip only that suffix so the same phrase
    # typed by the user elsewhere in the message is preserved.
    marker = "[Image uploaded]"
    text = user_content
    if text.endswith(marker):
        text = text[:-len(marker)]
        if text.endswith("\n"):
            text = text[:-1]

    # Nothing to re-send: keep the existing exchange rather than deleting it.
    if not text.strip():
        return "", history

    # Re-run the turn on a copy of the history without the last pair.
    return chat_fn({"text": text}, history[:-2])

def undo_fn(history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Drop the most recent exchange (the last two entries) from the chat.

    A history with fewer than two entries, or no history at all, yields an
    empty list.
    """
    nothing_to_undo = not history or len(history) < 2
    return [] if nothing_to_undo else history[:-2]

def clear_fn() -> Tuple[str, List]:
    """Reset the chat: an empty input value and a fresh, empty history."""
    blank_input = ""
    empty_history = []
    return blank_input, empty_history

# Create the Gradio interface
# Layout: intro text, then a row with a tips column next to the chat column,
# then a collapsible model-information panel and a set of example messages.
with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
    gr.Markdown(
        """
        # 🌟 Qwen3-VL Multimodal Chat
        
        Chat with Qwen3-VL - A powerful vision-language model that can understand and discuss images!
        
        **Features:**
        - 📝 Text conversations
        - 🖼️ Image understanding and analysis
        - 🎨 Visual question answering
        - 🔍 Detailed image descriptions
        
        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )
    
    with gr.Row():
        # Left column: static usage hints.
        with gr.Column(scale=1):
            gr.Markdown(
                """
                ### 💡 Tips:
                - Upload an image and ask questions about it
                - Try asking for detailed descriptions
                - Ask about objects, colors, text in images
                - Compare elements within the image
                """
            )
            
            gr.Markdown(
                """
                ### 📸 Example Prompts:
                - "What's in this image?"
                - "Describe this scene in detail"
                - "What text can you see?"
                - "Count the objects in the image"
                - "What's the mood of this image?"
                """
            )
        
        # Right column: conversation view, input box and action buttons.
        with gr.Column(scale=3):
            # type="messages" means the history is a list of
            # {"role": ..., "content": ...} dicts, which is what the handler
            # functions in this file read and write.
            # NOTE(review): avatar_images is given an emoji string for the
            # bot; confirm it renders as intended.
            chatbot = gr.Chatbot(
                label="Chat",
                type="messages",
                height=500,
                show_copy_button=True,
                bubble_full_width=False,
                avatar_images=[None, "🤖"],
                value=[]
            )
            
            with gr.Row():
                # Combined text + file input, restricted to image uploads.
                msg = gr.MultimodalTextbox(
                    label="Message",
                    placeholder="Type a message or upload an image...",
                    file_types=["image"],
                    submit_btn=True,
                    stop_btn=False
                )
            
            with gr.Row():
                retry_btn = gr.Button("🔄 Retry", variant="secondary", size="sm")
                undo_btn = gr.Button("↩️ Undo", variant="secondary", size="sm")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="sm")
    
    # Informational only: this panel contains no adjustable controls.
    with gr.Accordion("⚙️ Advanced Settings", open=False):
        gr.Markdown(
            """
            **Model Information:**
            - Model: Qwen3-VL-8B-Instruct
            - Optimized for vision-language tasks
            - Supports multiple languages
            - Best performance with clear, well-lit images
            """
        )
    
    # Set up event handlers
    # chat_fn and retry_fn return ("", history): the first value resets the
    # input box and the second replaces the chatbot contents.
    msg.submit(
        chat_fn,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=True
    )
    
    retry_btn.click(
        retry_fn,
        inputs=[chatbot],
        outputs=[msg, chatbot],
        queue=True
    )
    
    # Undo and clear only edit the history list (no model call), so they are
    # registered with queue=False.
    undo_btn.click(
        undo_fn,
        inputs=[chatbot],
        outputs=[chatbot],
        queue=False
    )
    
    clear_btn.click(
        clear_fn,
        outputs=[msg, chatbot],
        queue=False
    )
    
    # Add examples (text-only messages that populate the input box)
    gr.Examples(
        examples=[
            {"text": "Hello! What can you help me with today?"},
            {"text": "Can you describe an image if I upload one?"},
            {"text": "What are your capabilities?"},
        ],
        inputs=msg,
        label="Example Messages"
    )

if __name__ == "__main__":
    # Start the app only when this file is executed directly: errors are shown
    # in the UI, no public share link is created, and debug mode is enabled.
    demo.launch(show_error=True, share=False, debug=True)