Spaces: Running on Zero
Update Gradio app with multiple files
- app.py +118 -105
- requirements.txt +2 -0
app.py
CHANGED
@@ -2,9 +2,6 @@ import gradio as gr
 from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
 import torch
 from PIL import Image
-import io
-import base64
-
 import spaces
 
 # Load model and processor
@@ -15,61 +12,71 @@ model = Qwen3VLForConditionalGeneration.from_pretrained(
 )
 processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-2B-Instruct")
 
-def process_image(image):
-    """Convert image to base64 string for processing"""
-    if isinstance(image, str):
-        return image
-    if isinstance(image, Image.Image):
-        buffered = io.BytesIO()
-        image.save(buffered, format="PNG")
-        img_str = base64.b64encode(buffered.getvalue()).decode()
-        return f"data:image/png;base64,{img_str}"
-    return image
-
 @spaces.GPU(duration=120)
-def qwen_chat(message, image, chat_history):
+def qwen_chat_fn(message, history):
     """
-    Process chat ...
+    Process chat messages with multimodal support
 
     Args:
-        message (str): ...
-        image (PIL.Image): ...
-        chat_history (list): Previous conversation history
+        message (dict): Contains 'text' and 'files' keys
+        history (list): Chat history in messages format
 
     Returns:
-        ...
+        str: Assistant response
     """
-    ...
-    ...
+    # Extract text and files from the message
+    text = message.get("text", "")
+    files = message.get("files", [])
 
-    # Build messages list
+    # Build messages list for the model
     messages = []
 
     # Add previous chat history
-    for ...:
-        ...
+    for hist_item in history:
+        if hist_item["role"] == "user":
+            messages.append({
+                "role": "user",
+                "content": [{"type": "text", "text": hist_item["content"]}]
+            })
+        elif hist_item["role"] == "assistant":
+            messages.append({
+                "role": "assistant",
+                "content": [{"type": "text", "text": hist_item["content"]}]
+            })
+
+    # Build current message content
     current_content = []
-    if image is not None:
-        current_content.append({
-            "type": "image",
-            "image": image
-        })
 
-    if ...:
+    # Add images if provided
+    if files:
+        for file_path in files:
+            try:
+                image = Image.open(file_path)
+                current_content.append({
+                    "type": "image",
+                    "image": image
+                })
+            except Exception as e:
+                print(f"Error loading image {file_path}: {e}")
+
+    # Add text
+    if text:
         current_content.append({
             "type": "text",
-            "text": ...
+            "text": text
         })
 
+    # If no content, return empty
+    if not current_content:
+        return ""
+
+    # Add current message
     messages.append({
         "role": "user",
        "content": current_content
     })
 
-    # Prepare inputs
+    # Prepare inputs for the model
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
@@ -81,7 +88,13 @@ def qwen_chat(message, image, chat_history):
 
     # Generate response
     with torch.no_grad():
-        generated_ids = model.generate(...)
+        generated_ids = model.generate(
+            **inputs,
+            max_new_tokens=512,
+            temperature=0.7,
+            top_p=0.95,
+            do_sample=True
+        )
 
     # Decode output
     generated_ids_trimmed = [
@@ -93,80 +106,80 @@ def qwen_chat(message, image, chat_history):
         clean_up_tokenization_spaces=False
     )[0]
 
-    ...
+    return output_text
+
+# Example messages for demonstration
+example_messages = [
+    {"text": "Hello! Can you describe what makes a good photograph?", "files": []},
+    {"text": "What's the weather like in this image?", "files": []},
+    {"text": "Can you analyze the composition of this picture?", "files": []},
+]
+
+# Create the ChatInterface
+demo = gr.ChatInterface(
+    fn=qwen_chat_fn,
+    type="messages",
+    multimodal=True,
+    title="🎨 Qwen3-VL Multimodal Chat",
+    description="""
+    Chat with Qwen3-VL-2B-Instruct - A powerful multimodal AI that understands both text and images!
+
+    **Features:**
+    - 📝 Text conversations
+    - 🖼️ Image understanding and analysis
+    - 🎯 Visual question answering
+    - 📊 Detailed image descriptions
+
+    **How to use:**
+    - Type your message in the text box
+    - Click the attachment button to upload images
+    - Send your message to get AI responses
+
+    [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
+    """,
+    examples=[
+        {"text": "Hello! What can you help me with today?", "files": []},
+        {"text": "Can you explain what machine learning is?", "files": []},
+        {"text": "What are the key elements of good design?", "files": []},
+    ],
+    theme=gr.themes.Soft(),
+    autofocus=True,
+    submit_btn="Send",
+    stop_btn="Stop",
+    retry_btn="🔄 Retry",
+    undo_btn="↩️ Undo",
+    clear_btn="🗑️ Clear",
+    additional_inputs=None,
+    additional_inputs_accordion=None,
+    cache_examples=False,
+    analytics_enabled=False,
+    css="""
+    .contain { max-width: 1200px; margin: auto; }
+    .message { font-size: 14px; }
+    footer { display: none !important; }
+    """,
+    fill_height=True,
+    concurrency_limit=10
+)
 
-# ...
-with gr.Blocks(...) as demo:
+# Add additional information in a Markdown block
+with demo:
     gr.Markdown(
         """
-        ...
-        ...
+        ---
+        ### 💡 Tips for Best Results:
+        - **For images:** Upload clear, well-lit images for better analysis
+        - **For questions:** Be specific about what you want to know
+        - **Context matters:** Provide relevant context for more accurate responses
+        - **Multiple images:** You can upload multiple images in a single message
 
-        ...
+        ### 📝 Model Information:
+        - **Model:** Qwen3-VL-2B-Instruct
+        - **Parameters:** 2 Billion
+        - **Capabilities:** Image understanding, OCR, visual reasoning, general conversation
+        - **Powered by:** Hugging Face Spaces with ZeroGPU
         """
     )
-
-    with gr.Row():
-        with gr.Column(scale=3):
-            chatbot = gr.Chatbot(
-                label="Chat History",
-                type="messages",
-                height=600,
-                show_copy_button=True
-            )
-
-        with gr.Column(scale=1):
-            image_input = gr.Image(
-                label="Upload Image (Optional)",
-                type="pil",
-                sources=["upload", "clipboard"],
-                interactive=True
-            )
-
-    with gr.Row():
-        message_input = gr.Textbox(
-            label="Message",
-            placeholder="Type your message here...",
-            lines=2,
-            scale=4
-        )
-        send_btn = gr.Button("Send", scale=1, variant="primary")
-
-    with gr.Row():
-        clear_btn = gr.Button("Clear Chat", variant="secondary")
-
-    gr.Markdown(
-        """
-        ### Tips:
-        - Upload an image to ask questions about it
-        - Describe what you see or ask for analysis
-        - The model can answer questions about images and text
-        """
-    )
-
-    # Event handlers
-    def send_message(msg, img, history):
-        return qwen_chat(msg, img, history)
-
-    send_btn.click(
-        send_message,
-        inputs=[message_input, image_input, chatbot],
-        outputs=[chatbot, message_input]
-    )
-
-    message_input.submit(
-        send_message,
-        inputs=[message_input, image_input, chatbot],
-        outputs=[chatbot, message_input]
-    )
-
-    clear_btn.click(
-        lambda: ([], None, ""),
-        outputs=[chatbot, image_input, message_input]
-    )
 
 if __name__ == "__main__":
     demo.launch(share=False)
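For context on the new handler's inputs: with multimodal=True and type="messages", gr.ChatInterface passes the callback a dict with "text" and "files" keys rather than a plain string, and history arrives as a list of {"role", "content"} dicts. A minimal smoke test sketch, assuming app.py is importable as app and using a hypothetical image path:

# Hypothetical smoke test for qwen_chat_fn; the module name and image path
# are assumptions, not part of this commit.
from app import qwen_chat_fn

history = [
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi! Ask me about an image."},
]
message = {
    "text": "What is shown in this picture?",
    "files": ["/tmp/example.png"],  # hypothetical local image
}

print(qwen_chat_fn(message, history))  # returns the assistant reply as a str

Note that the handler flattens history turns to text-only content before calling processor.apply_chat_template, so images from earlier turns are not re-sent to the model.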
requirements.txt
CHANGED
@@ -5,3 +5,5 @@ torchvision
 pillow
 accelerate
 spaces
+sentencepiece
+qwen-vl-utils
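On the two new dependencies: sentencepiece is a common tokenizer dependency for transformers models, and qwen-vl-utils provides vision preprocessing helpers for the Qwen VL family. app.py above passes PIL images straight through the processor's chat template, but the qwen-vl-utils helper process_vision_info is the usual explicit route; a minimal sketch (image path hypothetical, not code from this commit):

# Sketch of qwen-vl-utils usage; app.py above does not call this directly.
from qwen_vl_utils import process_vision_info

messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": "/tmp/example.png"},  # hypothetical path
        {"type": "text", "text": "Describe this image."},
    ],
}]

# Resolves local paths, URLs, and PIL images into model-ready image/video inputs.
image_inputs, video_inputs = process_vision_info(messages)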