import gradio as gr
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image
import spaces

# Load model and processor
model = Qwen3VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen3-VL-2B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-2B-Instruct")
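# Note: device_map="auto" requires the `accelerate` package, and bfloat16
# assumes a GPU that supports it (Ampere or newer). On ZeroGPU the model is
# only moved onto a GPU while a @spaces.GPU-decorated call is running.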

@spaces.GPU(duration=120)
def qwen_chat_fn(message, history):
    """
    Process chat messages with multimodal support
    
    Args:
        message (dict): Contains 'text' and 'files' keys
        history (list): Chat history in messages format
    
    Returns:
        str: Assistant response
    """
    # Extract text and files from the message
    text = message.get("text", "")
    files = message.get("files", [])
    
    # Build messages list for the model
    messages = []
    
    # Replay previous turns. With a multimodal ChatInterface, a history entry
    # whose turn contained a file has non-string content (e.g. a file tuple),
    # so only plain-text turns are forwarded to the model here.
    for hist_item in history:
        content = hist_item.get("content", "")
        if hist_item["role"] in ("user", "assistant") and isinstance(content, str):
            messages.append({
                "role": hist_item["role"],
                "content": [{"type": "text", "text": content}]
            })
    
    # Build current message content
    current_content = []
    
    # Add images if provided; convert to RGB so palette or RGBA files
    # don't trip up the image processor
    if files:
        for file_path in files:
            try:
                image = Image.open(file_path).convert("RGB")
                current_content.append({
                    "type": "image",
                    "image": image
                })
            except Exception as e:
                print(f"Error loading image {file_path}: {e}")
    
    # Add text
    if text:
        current_content.append({
            "type": "text",
            "text": text
        })
    
    # If no content, return empty
    if not current_content:
        return ""
    
    # Add current message
    messages.append({
        "role": "user",
        "content": current_content
    })
    
    # Prepare inputs for the model
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    )
    inputs = inputs.to(model.device)
    
    # Generate response
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4000,
            temperature=0.7,
            top_p=0.95,
            do_sample=True
        )
    
    # Decode output
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]
    
    return output_text
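
# Optional: a streaming variant (a sketch, not wired into the demo below --
# pass fn=qwen_chat_stream_fn to gr.ChatInterface to try it). ChatInterface
# streams whenever the fn yields successively longer strings, and
# TextIteratorStreamer hands tokens out of model.generate() as they arrive.
# For brevity this sketch sends only the current turn; to keep history,
# build `messages` exactly as in qwen_chat_fn above.
from threading import Thread
from transformers import TextIteratorStreamer

@spaces.GPU(duration=120)
def qwen_chat_stream_fn(message, history):
    content = []
    for file_path in message.get("files", []):
        content.append({"type": "image", "image": Image.open(file_path).convert("RGB")})
    if message.get("text"):
        content.append({"type": "text", "text": message["text"]})
    if not content:
        yield ""
        return
    inputs = processor.apply_chat_template(
        [{"role": "user", "content": content}],
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device)
    streamer = TextIteratorStreamer(
        processor.tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    # generate() blocks, so it runs in a worker thread while we drain the streamer
    Thread(target=model.generate, kwargs=dict(
        **inputs, streamer=streamer, max_new_tokens=4000,
        temperature=0.7, top_p=0.95, do_sample=True
    )).start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial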


# Create the ChatInterface
demo = gr.ChatInterface(
    fn=qwen_chat_fn,
    type="messages",
    multimodal=True,
    title="🎨 Qwen3-VL Multimodal Chat",
    description="""
    Chat with Qwen3-VL-2B-Instruct - A powerful multimodal AI that understands both text and images!
    
    **Features:**
    - πŸ“ Text conversations
    - πŸ–ΌοΈ Image understanding and analysis
    - 🎯 Visual question answering
    - πŸ” Detailed image descriptions
    
    **How to use:**
    - Type your message in the text box
    - Click the attachment button to upload images
    - Send your message to get AI responses
    
    [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
    """,
    examples=[
        {"text": "Hello! What can you help me with today?", "files": []},
        {"text": "Can you explain what machine learning is?", "files": []},
        {"text": "What are the key elements of good design?", "files": []},
    ],
    theme=gr.themes.Soft(),
    autofocus=True,
    submit_btn="Send",
    stop_btn="Stop",
    cache_examples=False,
    analytics_enabled=False,
)

# Add additional information in a Markdown block
with demo:
    gr.Markdown(
        """
        ---
        ### πŸ’‘ Tips for Best Results:
        - **For images:** Upload clear, well-lit images for better analysis
        - **For questions:** Be specific about what you want to know
        - **Context matters:** Provide relevant context for more accurate responses
        - **Multiple images:** You can upload multiple images in a single message
        
        ### πŸš€ Model Information:
        - **Model:** Qwen3-VL-2B-Instruct
        - **Parameters:** 2 Billion
        - **Capabilities:** Image understanding, OCR, visual reasoning, general conversation
        - **Powered by:** Hugging Face Spaces with ZeroGPU
        """
    )
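
# Gradio's request queue (enabled by default in recent Gradio versions)
# serializes concurrent calls so they don't contend for the GPU;
# demo.queue() can be chained before launch() to tune its limits.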

if __name__ == "__main__":
    demo.launch(share=False)