import gradio as gr
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image
import spaces

# Load model and processor
model = Qwen3VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen3-VL-2B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-2B-Instruct")


@spaces.GPU  # request a ZeroGPU device for the duration of each call
def qwen_chat_fn(message, history):
    """
    Process chat messages with multimodal support.

    Args:
        message (dict): Contains 'text' and 'files' keys.
        history (list): Chat history in messages format.

    Returns:
        str: Assistant response.
    """
    # Extract text and files from the message
    text = message.get("text", "")
    files = message.get("files", [])

    # Build messages list for the model
    messages = []

    # Add previous chat history; multimodal history entries (uploaded files)
    # may carry non-string content, so only plain text turns are replayed here
    for hist_item in history:
        if not isinstance(hist_item.get("content"), str):
            continue
        if hist_item["role"] == "user":
            messages.append({
                "role": "user",
                "content": [{"type": "text", "text": hist_item["content"]}]
            })
        elif hist_item["role"] == "assistant":
            messages.append({
                "role": "assistant",
                "content": [{"type": "text", "text": hist_item["content"]}]
            })

    # Build current message content
    current_content = []

    # Add images if provided
    if files:
        for file_path in files:
            try:
                image = Image.open(file_path)
                current_content.append({
                    "type": "image",
                    "image": image
                })
            except Exception as e:
                print(f"Error loading image {file_path}: {e}")

    # Add text
    if text:
        current_content.append({
            "type": "text",
            "text": text
        })

    # If no content, return empty
    if not current_content:
        return ""

    # Add current message
    messages.append({
        "role": "user",
        "content": current_content
    })

    # Prepare inputs for the model
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    )
    inputs = inputs.to(model.device)

    # Generate response
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4000,
            temperature=0.7,
            top_p=0.95,
            do_sample=True
        )

    # Decode output
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    return output_text


# Example messages for demonstration
example_messages = [
    {"text": "Hello! Can you describe what makes a good photograph?", "files": []},
    {"text": "What's the weather like in this image?", "files": []},
    {"text": "Can you analyze the composition of this picture?", "files": []},
]

# Create the ChatInterface
demo = gr.ChatInterface(
    fn=qwen_chat_fn,
    type="messages",
    multimodal=True,
    title="Qwen3-VL Multimodal Chat",
    description="""
    Chat with Qwen3-VL-2B-Instruct - a multimodal AI that understands both text and images!

    **Features:**
    - Text conversations
    - Image understanding and analysis
    - Visual question answering
    - Detailed image descriptions

    **How to use:**
    - Type your message in the text box
    - Click the attachment button to upload images
    - Send your message to get AI responses

    [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
    """,
    examples=[
        {"text": "Hello! What can you help me with today?", "files": []},
        {"text": "Can you explain what machine learning is?", "files": []},
        {"text": "What are the key elements of good design?", "files": []},
    ],
    theme=gr.themes.Soft(),
    autofocus=True,
    submit_btn="Send",
    stop_btn="Stop",
    additional_inputs=None,
    additional_inputs_accordion=None,
    cache_examples=False,
    analytics_enabled=False,
)

# Add additional information in a Markdown block
with demo:
    gr.Markdown(
        """
        ---
        ### Tips for Best Results:
        - **For images:** Upload clear, well-lit images for better analysis
        - **For questions:** Be specific about what you want to know
        - **Context matters:** Provide relevant context for more accurate responses
        - **Multiple images:** You can upload multiple images in a single message

        ### Model Information:
        - **Model:** Qwen3-VL-2B-Instruct
        - **Parameters:** 2 Billion
        - **Capabilities:** Image understanding, OCR, visual reasoning, general conversation
        - **Powered by:** Hugging Face Spaces with ZeroGPU
        """
    )

if __name__ == "__main__":
    demo.launch(share=False)