import os
import time
from threading import Thread
import re
from PIL import Image, ImageDraw
import gradio as gr
import spaces
import torch
from transformers import (
Qwen2_5_VLForConditionalGeneration,
AutoProcessor,
TextIteratorStreamer,
)

# Constants for text generation
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
# Can be overridden via the MAX_INPUT_TOKEN_LENGTH environment variable.
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Load Lumian2-VLR-7B-Thinking
MODEL_ID_Y = "prithivMLmods/Lumian2-VLR-7B-Thinking"
processor = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
MODEL_ID_Y,
trust_remote_code=True,
torch_dtype=torch.float16
).to(device).eval()


def parse_model_output(text: str):
    """
    Extracts the final answer from the <answer> block and any (x, y)
    coordinates found in the <think> block of the model output.
    """
# Extract coordinates from the <think> block
think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
coordinates = []
if think_match:
think_content = think_match.group(1)
# Find all occurrences of (x, y) coordinates
coords_raw = re.findall(r'\((\d+),\s*(\d+)\)', think_content)
coordinates = [(int(x), int(y)) for x, y in coords_raw]
# Extract the answer from the <answer> block
answer_match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
answer = answer_match.group(1).strip() if answer_match else text
return answer, coordinates
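
# Minimal usage sketch for parse_model_output (the raw string below is a
# hypothetical model output, for illustration only; real traces vary):
#   raw = "<think>The cat sits near (120, 340).</think><answer>A cat on a sofa.</answer>"
#   answer, coords = parse_model_output(raw)
#   # answer == "A cat on a sofa.", coords == [(120, 340)]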


def draw_bounding_boxes(image: Image.Image, coordinates: list, box_size: int = 60, use_dotted_style: bool = False):
    """
    Draws a square box of side box_size, centered on each (x, y) coordinate.
    """
if not coordinates:
return image
img_with_boxes = image.copy()
draw = ImageDraw.Draw(img_with_boxes, "RGBA")
half_box = box_size // 2
for (x, y) in coordinates:
# Define the bounding box corners
x1 = x - half_box
y1 = y - half_box
x2 = x + half_box
y2 = y + half_box
if use_dotted_style:
# "Dotted like seaborn" - a semi-transparent fill with a solid outline
fill_color = (0, 100, 255, 60) # Light blue, semi-transparent
outline_color = (0, 0, 255) # Solid blue
draw.rectangle([x1, y1, x2, y2], fill=fill_color, outline=outline_color, width=2)
else:
# Default solid box
outline_color = (255, 0, 0) # Red
draw.rectangle([x1, y1, x2, y2], outline=outline_color, width=3)
return img_with_boxes
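
# Usage sketch (the file name and coordinates are illustrative only):
#   img = Image.open("example.jpg")
#   boxed = draw_bounding_boxes(img, [(120, 340)], box_size=60, use_dotted_style=True)
#   boxed.save("example_boxed.jpg")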
@spaces.GPU
def generate_image(text: str, image: Image.Image,
max_new_tokens: int,
temperature: float,
top_p: float,
top_k: int,
repetition_penalty: float,
draw_boxes: bool,
use_dotted_style: bool):
"""
Generates responses and draws bounding boxes based on model output.
Yields raw text, markdown-formatted text, and the processed image.
"""
if image is None:
yield "Please upload an image.", "Please upload an image.", None
return
# Yield the original image immediately for the output display
yield "", "", image
messages = [{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": text},
]
}]
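    # Render the chat messages into the model's text prompt; add_generation_prompt
    # appends the assistant-turn marker so the model starts answering.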
prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
        # Note: max_length only takes effect when truncation is enabled;
        # truncation is left off here so image tokens are never cut mid-sequence.
        truncation=False,
        max_length=MAX_INPUT_TOKEN_LENGTH
    ).to(device)
    # The Qwen2.5-VL processor exposes the tokenizer's decode(), so it can be
    # passed to TextIteratorStreamer directly.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = {
**inputs,
"streamer": streamer,
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"top_p": top_p,
"top_k": top_k,
"repetition_penalty": repetition_penalty,
"do_sample": True
}
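    # Run generation on a background thread so tokens can be consumed from the
    # streamer (and yielded to the UI) while generation is still in progress.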
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
buffer = ""
for new_text in streamer:
buffer += new_text
        time.sleep(0.01)  # small pause so the UI is not flooded with updates
# During generation, yield text updates but keep the original image
yield buffer, buffer, image
# After generation is complete, parse the output and draw boxes
final_answer, coordinates = parse_model_output(buffer)
output_image = image
if draw_boxes and coordinates:
output_image = draw_bounding_boxes(image, coordinates, use_dotted_style=use_dotted_style)
# Yield the final result with the processed image
yield buffer, final_answer, output_image
# Define examples for image inference
image_examples = [
["Explain the content in detail.", "images/D.jpg"],
["Explain the content (ocr).", "images/O.jpg"],
["What is the core meaning of the poem?", "images/S.jpg"],
["Provide a detailed caption for the image.", "images/A.jpg"],
["Explain the pie-chart in detail.", "images/2.jpg"],
["Jsonify Data.", "images/1.jpg"],
]
css = """
.submit-btn {
background-color: #2980b9 !important;
color: white !important;
}
.submit-btn:hover {
background-color: #3498db !important;
}
.canvas-output {
border: 2px solid #4682B4;
border-radius: 10px;
padding: 20px;
}
"""
# Create the Gradio Interface
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
gr.Markdown("# **Lumian2-VLR-7B-Thinking Image Inference**")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("## Image Inference")
image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
image_upload = gr.Image(type="pil", label="Image")
image_submit = gr.Button("Submit", elem_classes="submit-btn")
with gr.Accordion("Advanced options", open=False):
max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
gr.Examples(
examples=image_examples,
inputs=[image_query, image_upload]
)
with gr.Column(scale=2):
gr.Markdown("## Output")
with gr.Tabs():
with gr.TabItem("Image with Bounding Box"):
image_output = gr.Image(label="Processed Image")
with gr.TabItem("Raw Text"):
output = gr.Textbox(label="Raw Model Output", interactive=False, lines=10)
with gr.TabItem("Parsed Answer"):
markdown_output = gr.Markdown(label="Parsed Answer")
gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Qwen2.5-VL/discussions)")
    gr.Markdown(
        """> [Lumian2-VLR-7B-Thinking](https://huggingface.co/prithivMLmods/Lumian2-VLR-7B-Thinking): Lumian2-VLR-7B-Thinking is an experimental, high-fidelity vision-language reasoning model designed for fine-grained multimodal understanding. Built on Qwen2.5-VL-7B-Instruct, it enhances image captioning and document comprehension through explicit grounded reasoning, producing structured reasoning traces aligned with visual coordinates that enable explainable multimodal reasoning."""
    )
with gr.Row():
draw_boxes_checkbox = gr.Checkbox(label="Draw Bounding Boxes", value=True)
dotted_style_checkbox = gr.Checkbox(label="Use Dotted Style for Boxes", value=False)
image_submit.click(
fn=generate_image,
inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty, draw_boxes_checkbox, dotted_style_checkbox],
outputs=[output, markdown_output, image_output]
)
if __name__ == "__main__":
    demo.queue(max_size=50).launch(share=True)