import gradio as gr
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image
import spaces
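
# `spaces` is the Hugging Face Spaces helper package; it provides the
# @spaces.GPU decorator used for ZeroGPU allocation below.
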
# Load model and processor
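# bfloat16 halves the weight memory vs. float32, and device_map="auto" lets
# accelerate place the model on whatever accelerator is available.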
model = Qwen3VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen3-VL-2B-Instruct",
torch_dtype=torch.bfloat16,
device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-2B-Instruct")
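
# On ZeroGPU Spaces, @spaces.GPU attaches a GPU to each call for up to
# `duration` seconds; outside ZeroGPU the decorator is a pass-through.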
@spaces.GPU(duration=120)
def qwen_chat_fn(message, history):
"""
Process chat messages with multimodal support
Args:
message (dict): Contains 'text' and 'files' keys
history (list): Chat history in messages format
Returns:
str: Assistant response
"""
# Extract text and files from the message
text = message.get("text", "")
files = message.get("files", [])
    # Build messages list for the model
    messages = []

    # Replay previous turns. With a multimodal ChatInterface, Gradio can
    # store file messages whose content is not a plain string; skip those
    # so only text turns are replayed.
    for hist_item in history:
        if hist_item["role"] in ("user", "assistant") and isinstance(
            hist_item["content"], str
        ):
            messages.append({
                "role": hist_item["role"],
                "content": [{"type": "text", "text": hist_item["content"]}],
            })
    # Build current message content
    current_content = []

    # Add images if provided
    if files:
        for file_path in files:
            try:
                # convert("RGB") reads the file eagerly and normalizes
                # palette/RGBA images before preprocessing
                image = Image.open(file_path).convert("RGB")
                current_content.append({
                    "type": "image",
                    "image": image,
                })
            except Exception as e:
                print(f"Error loading image {file_path}: {e}")
    # Add text
    if text:
        current_content.append({
            "type": "text",
            "text": text,
        })

    # If no content, return empty
    if not current_content:
        return ""

    # Add current message
    messages.append({
        "role": "user",
        "content": current_content,
    })
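
    # `messages` now matches the chat-template schema the Qwen processor
    # expects: {"role", "content"} dicts whose content is a list of typed
    # parts ({"type": "image"} / {"type": "text"}).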
    # Prepare inputs for the model; with tokenize=True the processor runs
    # both the tokenizer and the image preprocessor in one call
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)
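
    # Sampling (temperature/top_p) trades determinism for variety; set
    # do_sample=False for reproducible replies.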
    # Generate response
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4000,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
        )
    # Decode only the newly generated tokens: generate() returns the prompt
    # followed by the completion, so slice off the prompt portion first
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return output_text
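
# Smoke test outside the UI (hypothetical image path); ChatInterface passes
# the same {"text", "files"} dict shape:
#   print(qwen_chat_fn({"text": "Describe this image.", "files": ["/tmp/cat.png"]}, []))
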
# Create the ChatInterface
demo = gr.ChatInterface(
    fn=qwen_chat_fn,
    type="messages",
    multimodal=True,
    title="🎨 Qwen3-VL Multimodal Chat",
    description="""
Chat with Qwen3-VL-2B-Instruct, a multimodal model that understands both text and images.

**Features:**
- πŸ“ Text conversations
- πŸ–ΌοΈ Image understanding and analysis
- 🎯 Visual question answering
- πŸ” Detailed image descriptions

**How to use:**
- Type your message in the text box
- Click the attachment button to upload images
- Send your message to get a response

[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
""",
    examples=[
        {"text": "Hello! What can you help me with today?", "files": []},
        {"text": "Can you explain what machine learning is?", "files": []},
        {"text": "What are the key elements of good design?", "files": []},
    ],
    theme=gr.themes.Soft(),
    autofocus=True,
    submit_btn="Send",
    stop_btn="Stop",
    cache_examples=False,
    analytics_enabled=False,
)
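
# gr.ChatInterface subclasses gr.Blocks, so re-entering its context lets us
# append extra components below the chat area.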
# Add additional information in a Markdown block
with demo:
    gr.Markdown(
        """
---
### πŸ’‘ Tips for Best Results:
- **For images:** Upload clear, well-lit images for better analysis
- **For questions:** Be specific about what you want to know
- **Context matters:** Provide relevant context for more accurate responses
- **Multiple images:** You can upload multiple images in a single message

### πŸš€ Model Information:
- **Model:** Qwen3-VL-2B-Instruct
- **Parameters:** 2 billion
- **Capabilities:** Image understanding, OCR, visual reasoning, general conversation
- **Powered by:** Hugging Face Spaces with ZeroGPU
"""
    )
if __name__ == "__main__":
    demo.launch(share=False)