import os
import time
from threading import Thread
import re
from PIL import Image, ImageDraw
import gradio as gr
import spaces
import torch
from transformers import (
Qwen2_5_VLForConditionalGeneration,
AutoProcessor,
TextIteratorStreamer,
)

# Constants for text generation
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
# Can be overridden via the MAX_INPUT_TOKEN_LENGTH environment variable.
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Load Lumian2-VLR-7B-Thinking
MODEL_ID_Y = "prithivMLmods/Lumian2-VLR-7B-Thinking"
processor = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
MODEL_ID_Y,
trust_remote_code=True,
torch_dtype=torch.float16
).to(device).eval()


def parse_model_output(text: str):
    """
    Extracts the final answer from the <answer> block and any (x, y)
    coordinates found in the <think> block of the model output.
    """
# Extract coordinates from the <think> block
think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
coordinates = []
if think_match:
think_content = think_match.group(1)
# Find all occurrences of (x, y) coordinates
coords_raw = re.findall(r'\((\d+),\s*(\d+)\)', think_content)
coordinates = [(int(x), int(y)) for x, y in coords_raw]
# Extract the answer from the <answer> block
answer_match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
answer = answer_match.group(1).strip() if answer_match else text
return answer, coordinates
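
# Minimal usage sketch for parse_model_output (the raw string below is a
# hypothetical model output, for illustration only; real traces vary):
#   raw = "<think>The cat sits near (120, 340).</think><answer>A cat on a sofa.</answer>"
#   answer, coords = parse_model_output(raw)
#   # answer == "A cat on a sofa.", coords == [(120, 340)]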


def draw_bounding_boxes(image: Image.Image, coordinates: list, box_size: int = 60, use_dotted_style: bool = False):
    """
    Draws a square box of side box_size, centered on each (x, y) coordinate.
    """
if not coordinates:
return image
img_with_boxes = image.copy()
draw = ImageDraw.Draw(img_with_boxes, "RGBA")
half_box = box_size // 2
for (x, y) in coordinates:
# Define the bounding box corners
x1 = x - half_box
y1 = y - half_box
x2 = x + half_box
y2 = y + half_box
if use_dotted_style:
# "Dotted like seaborn" - a semi-transparent fill with a solid outline
fill_color = (0, 100, 255, 60) # Light blue, semi-transparent
outline_color = (0, 0, 255) # Solid blue
draw.rectangle([x1, y1, x2, y2], fill=fill_color, outline=outline_color, width=2)
else:
# Default solid box
outline_color = (255, 0, 0) # Red
draw.rectangle([x1, y1, x2, y2], outline=outline_color, width=3)
return img_with_boxes
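
# Usage sketch (the file name and coordinates are illustrative only):
#   img = Image.open("example.jpg")
#   boxed = draw_bounding_boxes(img, [(120, 340)], box_size=60, use_dotted_style=True)
#   boxed.save("example_boxed.jpg")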
@spaces.GPU
def generate_image(text: str, image: Image.Image,
max_new_tokens: int,
temperature: float,
top_p: float,
top_k: int,
repetition_penalty: float,
draw_boxes: bool,
use_dotted_style: bool):
"""
Generates responses and draws bounding boxes based on model output.
Yields raw text, markdown-formatted text, and the processed image.
"""
if image is None:
yield "Please upload an image.", "Please upload an image.", None
return
# Yield the original image immediately for the output display
yield "", "", image
messages = [{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": text},
]
}]
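    # Render the chat messages into the model's text prompt; add_generation_prompt
    # appends the assistant-turn marker so the model starts answering.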
prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
        # Note: max_length only takes effect when truncation is enabled;
        # truncation is left off here so image tokens are never cut mid-sequence.
        truncation=False,
        max_length=MAX_INPUT_TOKEN_LENGTH
    ).to(device)
    # The Qwen2.5-VL processor exposes the tokenizer's decode(), so it can be
    # passed to TextIteratorStreamer directly.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = {
**inputs,
"streamer": streamer,
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"top_p": top_p,
"top_k": top_k,
"repetition_penalty": repetition_penalty,
"do_sample": True
}
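    # Run generation on a background thread so tokens can be consumed from the
    # streamer (and yielded to the UI) while generation is still in progress.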
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
buffer = ""
for new_text in streamer:
buffer += new_text
        time.sleep(0.01)  # small pause so the UI is not flooded with updates
# During generation, yield text updates but keep the original image
yield buffer, buffer, image
# After generation is complete, parse the output and draw boxes
final_answer, coordinates = parse_model_output(buffer)
output_image = image
if draw_boxes and coordinates:
output_image = draw_bounding_boxes(image, coordinates, use_dotted_style=use_dotted_style)
# Yield the final result with the processed image
yield buffer, final_answer, output_image
# Define examples for image inference
image_examples = [
["Explain the content in detail.", "images/D.jpg"],
["Explain the content (ocr).", "images/O.jpg"],
["What is the core meaning of the poem?", "images/S.jpg"],
["Provide a detailed caption for the image.", "images/A.jpg"],
["Explain the pie-chart in detail.", "images/2.jpg"],
["Jsonify Data.", "images/1.jpg"],
]
css = """
.submit-btn {
background-color: #2980b9 !important;
color: white !important;
}
.submit-btn:hover {
background-color: #3498db !important;
}
.canvas-output {
border: 2px solid #4682B4;
border-radius: 10px;
padding: 20px;
}
"""
# Create the Gradio Interface
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
gr.Markdown("# **Lumian2-VLR-7B-Thinking Image Inference**")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("## Image Inference")
image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
image_upload = gr.Image(type="pil", label="Image")
image_submit = gr.Button("Submit", elem_classes="submit-btn")
with gr.Accordion("Advanced options", open=False):
max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
gr.Examples(
examples=image_examples,
inputs=[image_query, image_upload]
)
with gr.Column(scale=2):
gr.Markdown("## Output")
with gr.Tabs():
with gr.TabItem("Image with Bounding Box"):
image_output = gr.Image(label="Processed Image")
with gr.TabItem("Raw Text"):
output = gr.Textbox(label="Raw Model Output", interactive=False, lines=10)
with gr.TabItem("Parsed Answer"):
markdown_output = gr.Markdown(label="Parsed Answer")
gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Qwen2.5-VL/discussions)")
    gr.Markdown(
        """> [Lumian2-VLR-7B-Thinking](https://huggingface.co/prithivMLmods/Lumian2-VLR-7B-Thinking): Lumian2-VLR-7B-Thinking is an experimental, high-fidelity vision-language reasoning model designed for fine-grained multimodal understanding. Built on Qwen2.5-VL-7B-Instruct, it enhances image captioning and document comprehension through explicit grounded reasoning, producing structured reasoning traces aligned with visual coordinates that enable explainable multimodal reasoning."""
    )
with gr.Row():
draw_boxes_checkbox = gr.Checkbox(label="Draw Bounding Boxes", value=True)
dotted_style_checkbox = gr.Checkbox(label="Use Dotted Style for Boxes", value=False)
image_submit.click(
fn=generate_image,
inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty, draw_boxes_checkbox, dotted_style_checkbox],
outputs=[output, markdown_output, image_output]
)
if __name__ == "__main__":
    demo.queue(max_size=50).launch(share=True)