import gradio as gr
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image
import spaces

# Load model and processor
model = Qwen3VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen3-VL-2B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-2B-Instruct")
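# Note: device_map="auto" requires the `accelerate` package, and bfloat16
# assumes a GPU that supports it (Ampere or newer). On ZeroGPU the model is
# only moved onto a GPU while a @spaces.GPU-decorated call is running.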

@spaces.GPU(duration=120)
def qwen_chat_fn(message, history):
    """
    Process chat messages with multimodal support
    
    Args:
        message (dict): Contains 'text' and 'files' keys
        history (list): Chat history in messages format
    
    Returns:
        str: Assistant response
    """
    # Extract text and files from the message
    text = message.get("text", "")
    files = message.get("files", [])
    
    # Build messages list for the model
    messages = []
    
    # Replay previous turns. With a multimodal ChatInterface, a history entry
    # whose turn contained a file has non-string content (e.g. a file tuple),
    # so only plain-text turns are forwarded to the model here.
    for hist_item in history:
        content = hist_item.get("content", "")
        if hist_item["role"] in ("user", "assistant") and isinstance(content, str):
            messages.append({
                "role": hist_item["role"],
                "content": [{"type": "text", "text": content}]
            })
    
    # Build current message content
    current_content = []
    
    # Add images if provided; convert to RGB so palette or RGBA files
    # don't trip up the image processor
    if files:
        for file_path in files:
            try:
                image = Image.open(file_path).convert("RGB")
                current_content.append({
                    "type": "image",
                    "image": image
                })
            except Exception as e:
                print(f"Error loading image {file_path}: {e}")
    
    # Add text
    if text:
        current_content.append({
            "type": "text",
            "text": text
        })
    
    # If no content, return empty
    if not current_content:
        return ""
    
    # Add current message
    messages.append({
        "role": "user",
        "content": current_content
    })
    
    # Prepare inputs for the model
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    )
    inputs = inputs.to(model.device)
    
    # Generate response
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4000,
            temperature=0.7,
            top_p=0.95,
            do_sample=True
        )
    
    # Decode output
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]
    
    return output_text
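
# Optional: a streaming variant (a sketch, not wired into the demo below --
# pass fn=qwen_chat_stream_fn to gr.ChatInterface to try it). ChatInterface
# streams whenever the fn yields successively longer strings, and
# TextIteratorStreamer hands tokens out of model.generate() as they arrive.
# For brevity this sketch sends only the current turn; to keep history,
# build `messages` exactly as in qwen_chat_fn above.
from threading import Thread
from transformers import TextIteratorStreamer

@spaces.GPU(duration=120)
def qwen_chat_stream_fn(message, history):
    content = []
    for file_path in message.get("files", []):
        content.append({"type": "image", "image": Image.open(file_path).convert("RGB")})
    if message.get("text"):
        content.append({"type": "text", "text": message["text"]})
    if not content:
        yield ""
        return
    inputs = processor.apply_chat_template(
        [{"role": "user", "content": content}],
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device)
    streamer = TextIteratorStreamer(
        processor.tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    # generate() blocks, so it runs in a worker thread while we drain the streamer
    Thread(target=model.generate, kwargs=dict(
        **inputs, streamer=streamer, max_new_tokens=4000,
        temperature=0.7, top_p=0.95, do_sample=True
    )).start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial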


# Create the ChatInterface
demo = gr.ChatInterface(
    fn=qwen_chat_fn,
    type="messages",
    multimodal=True,
    title="🎨 Qwen3-VL Multimodal Chat",
    description="""
    Chat with Qwen3-VL-2B-Instruct - A powerful multimodal AI that understands both text and images!
    
    **Features:**
    - πŸ“ Text conversations
    - πŸ–ΌοΈ Image understanding and analysis
    - 🎯 Visual question answering
    - πŸ” Detailed image descriptions
    
    **How to use:**
    - Type your message in the text box
    - Click the attachment button to upload images
    - Send your message to get AI responses
    
    [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
    """,
    examples=[
        {"text": "Hello! What can you help me with today?", "files": []},
        {"text": "Can you explain what machine learning is?", "files": []},
        {"text": "What are the key elements of good design?", "files": []},
    ],
    theme=gr.themes.Soft(),
    autofocus=True,
    submit_btn="Send",
    stop_btn="Stop",
    cache_examples=False,
    analytics_enabled=False,
)

# Add additional information in a Markdown block
with demo:
    gr.Markdown(
        """
        ---
        ### πŸ’‘ Tips for Best Results:
        - **For images:** Upload clear, well-lit images for better analysis
        - **For questions:** Be specific about what you want to know
        - **Context matters:** Provide relevant context for more accurate responses
        - **Multiple images:** You can upload multiple images in a single message
        
        ### πŸš€ Model Information:
        - **Model:** Qwen3-VL-2B-Instruct
        - **Parameters:** 2 Billion
        - **Capabilities:** Image understanding, OCR, visual reasoning, general conversation
        - **Powered by:** Hugging Face Spaces with ZeroGPU
        """
    )
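
# Gradio's request queue (enabled by default in recent Gradio versions)
# serializes concurrent calls so they don't contend for the GPU;
# demo.queue() can be chained before launch() to tune its limits.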

if __name__ == "__main__":
    demo.launch(share=False)