Spaces:

ChaseHan
/

Latex2Layout-Qwen2.5VL

Runtime error

App Files Files Community

Latex2Layout-Qwen2.5VL / app.py

ChaseHan

Update app.py

7ca7548 verified 5 months ago

raw

history blame

7.99 kB

	import gradio as gr
	import torch
	from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
	from PIL import Image, ImageDraw, ImageFont
	import json
	import re
	from spaces import GPU

	# --- 1. Configurations and Constants ---
	# Hugging Face Hub repository the model and processor are loaded from.
	MODEL_ID = "ChaseHan/Latex2Layout-2000-sync"

	# Size every uploaded page is resized to before it is sent to the model.
	TARGET_SIZE = (924, 1204)

	# Visualization style.
	OUTLINE_WIDTH = 3
	# Alpha channel shared by every region fill (the overlay is semi-transparent).
	_FILL_ALPHA = 90
	# Base RGB color for each layout label.
	_LABEL_RGB = {
	    "title": (255, 82, 82),            # Red
	    "abstract": (46, 204, 113),        # Green
	    "heading": (52, 152, 219),         # Blue
	    "footnote": (241, 196, 15),        # Yellow
	    "figure": (155, 89, 182),          # Purple
	    "figure caption": (26, 188, 156),  # Teal
	    "table": (230, 126, 34),           # Orange
	    "table caption": (44, 62, 80),     # Dark Blue/Gray
	    "math": (231, 76, 60),             # Pomegranate
	    "text": (149, 165, 166),           # Gray
	    "other": (127, 140, 141),          # Darker Gray
	}
	# Label -> RGBA fill color used when drawing detected regions.
	LABEL_COLORS = {name: (*rgb, _FILL_ALPHA) for name, rgb in _LABEL_RGB.items()}

	# Default instruction sent to the model. It is assembled from adjacent string
	# literals; the resulting text must stay exactly as-is.
	DEFAULT_PROMPT = (
	    '<image>Please carefully observe the document and detect the following regions: '
	    '"title", "abstract", "heading", "footnote", "figure", "figure caption", '
	    '"table", "table caption", "math", "text". '
	    "Output each detected region's bbox coordinates in JSON format. "
	    'The format of the output is: <answer>```json'
	    '[{"bbox_2d": [x1, y1, x2, y2], "label": "region name", "order": "reading order"}]'
	    '```</answer>.'
	)

	# --- 2. Load Model and Processor ---
	# The model and processor are loaded once, at import time, and reused by
	# every inference call.
	print("Loading model and processor, this may take a moment...")
	try:
	    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
	        MODEL_ID,
	        torch_dtype=torch.float16,
	        device_map="auto",
	    )
	    processor = AutoProcessor.from_pretrained(MODEL_ID)
	except Exception as e:
	    # Nothing can be served without the model. Report the failure and stop
	    # with a non-zero status: the bare `exit()` helper is injected by the
	    # `site` module for interactive use and would have exited with status 0,
	    # making a failed start look like a clean shutdown.
	    print(f"Error loading model: {e}")
	    raise SystemExit(1) from e
	else:
	    print("Model loaded successfully!")

	# --- 3. Core Inference and Visualization Function ---
	def _reading_order_key(item):
	    """Return a numeric sort key for a region's "order" field.

	    The model may emit the order as an int or as a string, so the value is
	    coerced to float; missing or non-numeric orders sort last (999).
	    """
	    try:
	        return float(item.get("order", 999))
	    except (TypeError, ValueError):
	        return 999.0


	def _parse_layout_items(output_text):
	    """Decode the model output into region dicts sorted by reading order.

	    The JSON payload is taken from a ```json ... ``` fence when present,
	    otherwise the whole output is parsed. A single JSON object is treated as
	    a one-element list; entries that are not objects are dropped.

	    Raises:
	        ValueError: if the payload is not valid JSON (json.JSONDecodeError is
	            a ValueError subclass) or is neither a list nor an object.
	    """
	    fenced = re.search(r"```json(.*?)```", output_text, re.DOTALL)
	    payload = fenced.group(1).strip() if fenced else output_text.strip()
	    parsed = json.loads(payload)
	    if isinstance(parsed, dict):
	        parsed = [parsed]
	    elif not isinstance(parsed, list):
	        raise ValueError("model output is not a JSON list of regions")
	    regions = [entry for entry in parsed if isinstance(entry, dict)]
	    return sorted(regions, key=_reading_order_key)


	def _normalize_bbox(bbox):
	    """Return [left, top, right, bottom] for a 4-number bbox, else None.

	    Corners are reordered so left <= right and top <= bottom, because
	    ImageDraw.rectangle rejects boxes whose second corner precedes the first.
	    """
	    if not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
	        return None
	    if not all(isinstance(value, (int, float)) for value in bbox):
	        return None
	    x1, y1, x2, y2 = bbox
	    return [min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)]


	@GPU
	def analyze_and_visualize_layout(input_image: Image.Image, prompt: str, temperature: float, top_p: float, progress=gr.Progress(track_tqdm=True)):
	    """Run layout detection on a page image and draw the detected regions.

	    Args:
	        input_image: Uploaded page image, or None when nothing was uploaded.
	        prompt: Instruction text sent to the model together with the image.
	        temperature: Sampling temperature. Values <= 0 switch generation to
	            greedy decoding, since sampling requires a strictly positive
	            temperature.
	        top_p: Nucleus-sampling threshold; only used when sampling.
	        progress: Gradio progress reporter.

	    Returns:
	        A ``(image, text)`` tuple: the page (resized to TARGET_SIZE) with the
	        detected regions overlaid plus the raw model output; the resized page
	        and an error message when the output cannot be parsed; or
	        ``(None, message)`` when no image was provided.
	    """
	    if input_image is None:
	        return None, "Please upload an image first."

	    progress(0, desc="Resizing image...")
	    # RGBA is needed later so the translucent overlay can be alpha-composited.
	    image = input_image.resize(TARGET_SIZE).convert("RGBA")

	    messages = [
	        {"role": "user", "content": [
	            {"type": "image", "image": image},
	            {"type": "text", "text": prompt},  # Use the configurable prompt
	        ]}
	    ]

	    progress(0.2, desc="Preparing model inputs...")
	    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	    inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt").to(model.device)

	    # The UI slider can deliver 0 (or an int such as 1); sampling needs a
	    # strictly positive float temperature, so fall back to greedy decoding
	    # when the temperature is not positive.
	    temperature = float(temperature)
	    if temperature > 0:
	        sampling_kwargs = {"do_sample": True, "temperature": temperature, "top_p": float(top_p)}
	    else:
	        sampling_kwargs = {"do_sample": False}

	    progress(0.5, desc="Generating layout data...")
	    with torch.no_grad():
	        output_ids = model.generate(**inputs, max_new_tokens=4096, **sampling_kwargs)

	    # Drop the prompt tokens so only the newly generated text is decoded.
	    output_text = processor.batch_decode(
	        output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
	    )[0]

	    progress(0.8, desc="Parsing and visualizing results...")
	    try:
	        regions = _parse_layout_items(output_text)
	    except ValueError:
	        return image.convert("RGB"), f"Failed to parse JSON from model output:\n\n{output_text}"

	    overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
	    draw = ImageDraw.Draw(overlay)

	    try:
	        font = ImageFont.truetype("Arial.ttf", 15)
	    except IOError:
	        # Font file not available; use Pillow's built-in font.
	        font = ImageFont.load_default()

	    for item in regions:
	        bbox = _normalize_bbox(item.get("bbox_2d"))
	        if bbox is None:
	            continue  # Skip regions without a usable bounding box.

	        label = item.get("label", "other")
	        order = item.get("order", "")

	        # str() keeps the lookup safe if the model emits a non-string label.
	        fill_color_rgba = LABEL_COLORS.get(str(label), LABEL_COLORS["other"])
	        solid_color_rgb = fill_color_rgba[:3]

	        draw.rectangle(bbox, fill=fill_color_rgba, outline=solid_color_rgb, width=OUTLINE_WIDTH)

	        # Solid tag in the region's top-left corner: "<order>: <label>".
	        tag_text = f"{order}: {label}"
	        tag_bbox = draw.textbbox((0, 0), tag_text, font=font)
	        tag_w, tag_h = tag_bbox[2] - tag_bbox[0], tag_bbox[3] - tag_bbox[1]

	        tag_bg_box = [bbox[0], bbox[1], bbox[0] + tag_w + 10, bbox[1] + tag_h + 6]
	        draw.rectangle(tag_bg_box, fill=solid_color_rgb)
	        draw.text((bbox[0] + 5, bbox[1] + 3), tag_text, font=font, fill="white")

	    visualized_image = Image.alpha_composite(image, overlay).convert("RGB")
	    return visualized_image, output_text

	def clear_outputs():
	    """Return a pair of ``None`` values, one for each output field to reset."""
	    cleared_image = None
	    cleared_text = None
	    return cleared_image, cleared_text

	# --- 4. Gradio User Interface ---
	with gr.Blocks(title="Academic Paper Layout Detection", theme=gr.themes.Glass()) as demo:

	    # Header and short usage notes.
	    gr.Markdown("# 📄 Academic Paper Layout Detection")
	    gr.Markdown(
	        "Welcome! This tool uses a Qwen2.5-VL-3B-Instruct model fine-tuned on our Latex2Layout annotated layout dataset to identify layout regions in academic papers. "
	        "Upload a document image to begin."
	        "\n> Please note: All uploaded images are automatically resized to 924x1204 pixels to meet the model's input requirements."
	    )
	    gr.Markdown("<hr>")

	    # Input page on the left, annotated result on the right.
	    with gr.Row():
	        with gr.Column(scale=4):
	            input_image = gr.Image(label="Upload Document Image", type="pil", height=700)
	        with gr.Column(scale=5):
	            output_image = gr.Image(label="Analyzed Layout", type="pil", height=700, interactive=False)

	    with gr.Row():
	        analyze_btn = gr.Button("✨ Analyze Layout", variant="primary", scale=1)

	    # Collapsed panel holding the prompt and the sampling controls.
	    with gr.Accordion("Advanced Settings", open=False):
	        prompt_textbox = gr.Textbox(
	            value=DEFAULT_PROMPT,
	            label="Prompt",
	            info="The prompt used to instruct the model.",
	            lines=5,
	        )
	        temp_slider = gr.Slider(
	            label="Temperature",
	            info="Controls randomness. Higher values mean more random outputs.",
	            value=0.7,
	            minimum=0.0,
	            maximum=2.0,
	            step=0.05,
	        )
	        top_p_slider = gr.Slider(
	            label="Top-p (Nucleus Sampling)",
	            info="Filters a cumulative probability mass. Lower values are less random.",
	            value=0.9,
	            minimum=0.0,
	            maximum=1.0,
	            step=0.05,
	        )

	    output_text = gr.Textbox(label="Model Raw Output", lines=8, interactive=False, visible=True)

	    # Examples only fill the image input; the user still clicks "Analyze"
	    # to run with the current settings.
	    example_pages = [["page_2.png"], ["page_3.png"], ["page_5.png"], ["page_13.png"]]
	    gr.Examples(
	        examples=example_pages,
	        inputs=[input_image],
	        label="Examples (Click to Run)",
	    )

	    gr.Markdown("<p style='text-align:center; color:grey;'>Powered by the Latex2Layout dataset generated by Feijiang Han</p>")

	    # --- Event Handlers ---
	    analyze_btn.click(
	        fn=analyze_and_visualize_layout,
	        inputs=[input_image, prompt_textbox, temp_slider, top_p_slider],
	        outputs=[output_image, output_text],
	    )

	    # Uploading a new image or clearing the current one resets both outputs.
	    for reset_trigger in (input_image.upload, input_image.clear):
	        reset_trigger(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])

	# --- 5. Launch the Application ---
	if __name__ == "__main__":
	    demo.launch()