import gradio as gr
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image
import spaces

# Load model and processor
model = Qwen3VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen3-VL-2B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-2B-Instruct")


@spaces.GPU(duration=120)
def qwen_chat_fn(message, history):
    """
    Process a chat turn with multimodal support.

    Args:
        message (dict): Contains "text" and "files" keys.
        history (list): Chat history in Gradio "messages" format.

    Returns:
        str: Assistant response.
    """
    # Extract text and files from the incoming message
    text = message.get("text", "")
    files = message.get("files", [])

    # Build the messages list for the model. File attachments from earlier
    # turns appear in history as tuples of paths rather than strings, so only
    # plain-text turns are replayed; prior images are not re-sent to the model.
    messages = []
    for hist_item in history:
        content = hist_item.get("content", "")
        if hist_item["role"] in ("user", "assistant") and isinstance(content, str):
            messages.append({
                "role": hist_item["role"],
                "content": [{"type": "text", "text": content}],
            })

    # Build the current message content
    current_content = []

    # Add images if provided
    for file_path in files:
        try:
            image = Image.open(file_path)
            current_content.append({"type": "image", "image": image})
        except Exception as e:
            print(f"Error loading image {file_path}: {e}")

    # Add text
    if text:
        current_content.append({"type": "text", "text": text})

    # Nothing to respond to
    if not current_content:
        return ""

    # Add the current user message
    messages.append({"role": "user", "content": current_content})

    # Tokenize the conversation with the model's chat template
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    # Generate the response
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4000,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
        )

    # Strip the prompt tokens from the output before decoding
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    return output_text


# Example messages for demonstration (not wired into the UI; the
# ChatInterface below defines its own examples)
example_messages = [
    {"text": "Hello! Can you describe what makes a good photograph?", "files": []},
    {"text": "What's the weather like in this image?", "files": []},
    {"text": "Can you analyze the composition of this picture?", "files": []},
]

# Create the ChatInterface
demo = gr.ChatInterface(
    fn=qwen_chat_fn,
    type="messages",
    multimodal=True,
    title="🎨 Qwen3-VL Multimodal Chat",
    description="""
Chat with Qwen3-VL-2B-Instruct, a multimodal AI that understands both text and images!

**Features:**
- 📝 Text conversations
- 🖼️ Image understanding and analysis
- 🎯 Visual question answering
- 🔍 Detailed image descriptions

**How to use:**
- Type your message in the text box
- Click the attachment button to upload images
- Send your message to get AI responses

[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
""",
    examples=[
        {"text": "Hello! What can you help me with today?", "files": []},
        {"text": "Can you explain what machine learning is?", "files": []},
        {"text": "What are the key elements of good design?", "files": []},
    ],
    theme=gr.themes.Soft(),
    autofocus=True,
    submit_btn="Send",
    stop_btn="Stop",
    additional_inputs=None,
    additional_inputs_accordion=None,
    cache_examples=False,
    analytics_enabled=False,
)

# Append additional information below the chat UI
with demo:
    gr.Markdown(
        """
---
### 💡 Tips for Best Results:
- **For images:** Upload clear, well-lit images for better analysis
- **For questions:** Be specific about what you want to know
- **Context matters:** Provide relevant context for more accurate responses
- **Multiple images:** You can upload multiple images in a single message

### 🚀 Model Information:
- **Model:** Qwen3-VL-2B-Instruct
- **Parameters:** 2 billion
- **Capabilities:** Image understanding, OCR, visual reasoning, general conversation
- **Powered by:** Hugging Face Spaces with ZeroGPU
"""
    )

if __name__ == "__main__":
    demo.launch(share=False)
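
# A minimal smoke test for qwen_chat_fn without launching the UI, left
# commented out since demo.launch() blocks. "sample.jpg" is a hypothetical
# local image path used only for illustration, not part of this app:
#
#   reply = qwen_chat_fn({"text": "Describe this image.", "files": ["sample.jpg"]}, [])
#   print(reply)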