Spaces:

ChaseHan
/

Latex2Layout-Qwen2.5VL

Running on Zero

App Files Files Community

Latex2Layout-Qwen2.5VL / app.py

ChaseHan

Update app.py

baeaeb1 verified 4 months ago

raw

history blame contribute delete

12.8 kB

	import gradio as gr
	import torch
	from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
	from PIL import Image, ImageDraw, ImageFont
	import json
	import re
	from spaces import GPU

	# --- 1. Configurations and Constants ---
	# Hugging Face repository ids and the labels shown to the user for each model.
	MODEL_BASE_ID = "ChaseHan/Latex2Layout-2000-sync"
	MODEL_BASE_NAME = "Latex2Layout-SFT"

	# An earlier enhanced repository id, "ChaseHan/Latex2Layout-RL", was left
	# commented out in favour of the id below.
	MODEL_ENHANCED_ID = "ChaseHan/Latex2Layout-2000-sync-enhanced"
	MODEL_ENHANCED_NAME = "Latex2Layout-RL"

	# Label of the two-model "mixing" mode. It is defined here but is not one of
	# the entries of MODEL_CHOICES.
	MODEL_MIXING_NAME = "Mixing Beta Version(Powerful Mode)"
	MODEL_CHOICES = [MODEL_BASE_NAME, MODEL_ENHANCED_NAME]

	# (width, height) every input image is resized to before inference.
	TARGET_SIZE = (924, 1204)

	# Visualization style: border thickness of each drawn region.
	OUTLINE_WIDTH = 3

	# Every region colour shares the same alpha so the page stays readable
	# underneath the translucent fill.
	_FILL_ALPHA = 90
	_LABEL_RGB = {
	    "title": (255, 82, 82),            # Red
	    "abstract": (46, 204, 113),        # Green
	    "heading": (52, 152, 219),         # Blue
	    "footnote": (241, 196, 15),        # Yellow
	    "figure": (155, 89, 182),          # Purple
	    "figure caption": (26, 188, 156),  # Teal
	    "table": (230, 126, 34),           # Orange
	    "table caption": (44, 62, 80),     # Dark Blue/Gray
	    "math": (231, 76, 60),             # Pomegranate
	    "text": (149, 165, 166),           # Gray
	    "other": (127, 140, 141),          # Light Gray
	}
	# Region label -> RGBA fill colour.
	LABEL_COLORS = {name: rgb + (_FILL_ALPHA,) for name, rgb in _LABEL_RGB.items()}

	# Default instruction sent to the model together with the page image.
	DEFAULT_PROMPT = (
	    '<image>Please carefully observe the document and detect the following regions: '
	    '"title", "abstract", "heading", "footnote", "figure", "figure caption", '
	    '"table", "table caption", "math", "text". '
	    "Output each detected region's bbox coordinates in JSON format. "
	    'The format of the output is: <answer>```json'
	    '[{"bbox_2d": [x1, y1, x2, y2], "label": "region name", "order": "reading order"}]'
	    '```</answer>.'
	)

	# --- 2. Load Models and Processor ---
	# Both checkpoints are loaded once at import time so every request can reuse
	# them. The processor is taken from the base repository and is used for both
	# models.
	print("Loading models, this will take some time and VRAM...")
	try:
	    print(f"Loading {MODEL_BASE_NAME}...")
	    model_base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
	        MODEL_BASE_ID,
	        torch_dtype=torch.float16,
	        device_map="auto"
	    )

	    print(f"Loading {MODEL_ENHANCED_NAME}...")
	    model_enhanced = Qwen2_5_VLForConditionalGeneration.from_pretrained(
	        MODEL_ENHANCED_ID,
	        torch_dtype=torch.float16,
	        device_map="auto"
	    )

	    processor = AutoProcessor.from_pretrained(MODEL_BASE_ID)
	    print("All models loaded successfully!")
	except Exception as e:
	    print(f"Error loading models: {e}")
	    # The app cannot work without its models, so stop here. SystemExit(1)
	    # replaces the interactive-shell helper exit(): it needs no `site`
	    # module, reports a failing exit status, and keeps the original
	    # exception chained for debugging.
	    raise SystemExit(1) from e

	# --- Helper functions for geometric calculations ---
	def calculate_iou(boxA, boxB):
	    """Return the Intersection over Union of two ``[x1, y1, x2, y2]`` boxes.

	    The result is ``0`` when the union area is not positive.
	    """
	    # Corners of the overlapping rectangle (if the boxes overlap at all).
	    left, top = max(boxA[0], boxB[0]), max(boxA[1], boxB[1])
	    right, bottom = min(boxA[2], boxB[2]), min(boxA[3], boxB[3])

	    # Clamp each side at zero so disjoint boxes contribute no overlap.
	    overlap = max(0, right - left) * max(0, bottom - top)

	    area_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
	    area_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
	    union = float(area_a + area_b - overlap)

	    if union > 0:
	        return overlap / union
	    return 0

	def calculate_intersection_area(boxA, boxB):
	    """Return the absolute overlap area of two ``[x1, y1, x2, y2]`` boxes.

	    Disjoint boxes yield ``0`` because each side of the overlap is clamped at
	    zero before multiplying.
	    """
	    xA = max(boxA[0], boxB[0])
	    yA = max(boxA[1], boxB[1])
	    xB = min(boxA[2], boxB[2])
	    yB = min(boxA[3], boxB[3])
	    return max(0, xB - xA) * max(0, yB - yA)

	# --- Function to remove nested elements of the same type ---
	def remove_nested_elements(results, containment_threshold=0.8):
	    """Drop boxes that are mostly contained in a larger box with the same label.

	    For every pair of same-label entries, the smaller box is removed when more
	    than ``containment_threshold`` of its own area lies inside the other box.
	    When both boxes have the same area (for example exact duplicates), only the
	    later entry is removed so that one copy always survives.

	    Args:
	        results: List of detection entries; dict entries are read through the
	            ``"label"`` and ``"bbox_2d"`` keys.
	        containment_threshold: Fraction of the smaller box that must be covered
	            for it to count as nested.

	    Returns:
	        A new list with the nested entries left out. Entries without a usable
	        4-value box (or with a non-positive area) never take part in the
	        comparison and are returned unchanged.
	    """
	    # Gather (index, label, bbox, area) once instead of recomputing per pair.
	    candidates = []
	    for idx, item in enumerate(results):
	        if not isinstance(item, dict):
	            continue
	        bbox = item.get("bbox_2d")
	        if not bbox or len(bbox) != 4:
	            continue
	        area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
	        # A zero area would divide by zero below; a negative area can never
	        # exceed the threshold, so both are skipped.
	        if area <= 0:
	            continue
	        candidates.append((idx, item.get("label"), bbox, area))

	    indices_to_remove = set()
	    # Visit each unordered pair exactly once. Visiting (i, j) and (j, i) made
	    # equal-area boxes remove each other.
	    for pos, (idx_a, label_a, box_a, area_a) in enumerate(candidates):
	        for idx_b, label_b, box_b, area_b in candidates[pos + 1:]:
	            # Rule only applies to elements with the same label
	            if label_a != label_b:
	                continue

	            if area_a < area_b:
	                smaller_box, larger_box, smaller_area, smaller_idx = box_a, box_b, area_a, idx_a
	            else:
	                # Equal areas fall here: the later entry is the one dropped.
	                smaller_box, larger_box, smaller_area, smaller_idx = box_b, box_a, area_b, idx_b

	            intersection = calculate_intersection_area(smaller_box, larger_box)
	            if (intersection / smaller_area) > containment_threshold:
	                indices_to_remove.add(smaller_idx)

	    # Return a new list containing only the elements that were not marked for removal
	    return [item for idx, item in enumerate(results) if idx not in indices_to_remove]


	# --- 3. Core Inference and Visualization Function ---
	@GPU
	def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name: str, prompt: str, progress=gr.Progress(track_tqdm=True)):
	    """Detect layout regions on a page image and draw them as an overlay.

	    The image is resized to ``TARGET_SIZE``, sent with ``prompt`` to the
	    selected model (or to both models when ``selected_model_name`` equals
	    ``MODEL_MIXING_NAME``), the JSON in the answer is parsed, nested same-label
	    boxes are removed and the remaining boxes are drawn sorted by their
	    ``order`` value.

	    Args:
	        input_image: Image supplied by the user, or ``None`` if nothing was
	            uploaded.
	        selected_model_name: User-facing model name chosen in the UI.
	        prompt: Instruction text sent to the model with the image.
	        progress: Gradio progress tracker used to report the current stage.

	    Returns:
	        A ``(image, text)`` pair: the annotated RGB image and the raw model
	        output. When no image is given the image is ``None``; when the model
	        output cannot be parsed the resized, un-annotated image is returned
	        with an error message that includes the raw output.
	    """
	    if input_image is None:
	        return None, "Please upload an image first."

	    progress(0, desc="Resizing image...")
	    image_resized = input_image.resize(TARGET_SIZE)
	    image_rgba = image_resized.convert("RGBA")

	    def usable_box(item):
	        """Return the entry's ``bbox_2d`` if it holds four numbers, else ``None``."""
	        bbox = item.get("bbox_2d")
	        if (
	            isinstance(bbox, (list, tuple))
	            and len(bbox) == 4
	            and all(isinstance(v, (int, float)) for v in bbox)
	        ):
	            return bbox
	        return None

	    def run_inference(model_to_run, model_name_desc):
	        """Run one model and return ``(parsed_entries_or_None, raw_text)``."""
	        progress(0.1, desc=f"Preparing inputs for {model_name_desc}...")
	        messages = [{"role": "user", "content": [{"type": "image", "image": image_rgba}, {"type": "text", "text": prompt}]}]
	        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	        inputs = processor(text=[text], images=[image_rgba], padding=True, return_tensors="pt").to(model_to_run.device)

	        progress(0.5, desc=f"Generating layout data with {model_name_desc}...")
	        with torch.no_grad():
	            # The processor output is a mapping of named tensors and must be
	            # passed as keyword arguments; a single * would pass only its keys
	            # positionally.
	            # NOTE(review): 40962 looks like a typo for 4096 - confirm.
	            output_ids = model_to_run.generate(**inputs, max_new_tokens=40962, do_sample=False)

	        # Decode only the newly generated tokens, not the echoed prompt.
	        raw_text = processor.batch_decode(output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]

	        try:
	            json_match = re.search(r"```json(.*?)```", raw_text, re.DOTALL)
	            # Fall back to the whole answer when no fenced JSON block is found.
	            json_str = json_match.group(1).strip() if json_match else raw_text.strip()
	            parsed = json.loads(json_str)
	        except (json.JSONDecodeError, AttributeError):
	            return None, raw_text

	        # Valid JSON is not necessarily a list of objects: accept a single
	        # object, reject scalars, and ignore non-object list members.
	        if isinstance(parsed, dict):
	            parsed = [parsed]
	        if not isinstance(parsed, list):
	            return None, raw_text
	        return [entry for entry in parsed if isinstance(entry, dict)], raw_text

	    if selected_model_name == MODEL_MIXING_NAME:
	        base_results, raw_text_base = run_inference(model_base, "Base Model")
	        enhanced_results, raw_text_enhanced = run_inference(model_enhanced, "Enhanced Model")
	        output_text = f"--- Base Model Output ---\n{raw_text_base}\n\n--- Enhanced Model Output ---\n{raw_text_enhanced}"

	        if base_results is None or enhanced_results is None:
	            return image_rgba.convert("RGB"), f"Failed to parse JSON from one or both models:\n\n{output_text}"

	        progress(0.8, desc="Merging results based on IoU...")
	        # Keep every base detection and add only the enhanced detections that
	        # do not overlap a base box by more than 0.5 IoU.
	        merged_results = list(base_results)
	        base_bboxes = [box for box in map(usable_box, base_results) if box is not None]

	        for enhanced_item in enhanced_results:
	            enhanced_bbox = usable_box(enhanced_item)
	            if enhanced_bbox is None:
	                continue
	            is_duplicate = any(calculate_iou(enhanced_bbox, base_bbox) > 0.5 for base_bbox in base_bboxes)
	            if not is_duplicate:
	                merged_results.append(enhanced_item)

	        results = merged_results
	    else:
	        model = model_base if selected_model_name == MODEL_BASE_NAME else model_enhanced
	        results, output_text = run_inference(model, selected_model_name)
	        if results is None:
	            return image_rgba.convert("RGB"), f"Failed to parse JSON from model output:\n\n{output_text}"

	    # Entries without a well-formed box are never drawn; dropping them here
	    # keeps the post-processing below from tripping over malformed output.
	    results = [item for item in results if usable_box(item) is not None]

	    # --- Apply the final post-processing step to remove nested elements ---
	    progress(0.85, desc="Cleaning up nested elements...")
	    results = remove_nested_elements(results)

	    # --- Visualization ---
	    progress(0.9, desc="Visualizing final results...")
	    overlay = Image.new('RGBA', image_rgba.size, (255, 255, 255, 0))
	    draw = ImageDraw.Draw(overlay)

	    try:
	        font = ImageFont.truetype("Arial.ttf", 15)
	    except IOError:
	        # Arial is not available on this machine; use Pillow's built-in font.
	        font = ImageFont.load_default()

	    def get_safe_order(item):
	        """Return the entry's ``order`` as an int, or 999 if it is missing or invalid."""
	        try:
	            return int(item.get("order", 999))
	        except (ValueError, TypeError):
	            # Non-numeric values (for example "abc") sort to the end.
	            return 999

	    for item in sorted(results, key=get_safe_order):
	        bbox = usable_box(item)
	        if bbox is None:
	            continue
	        label, order = item.get("label", "other"), item.get("order", "")

	        # Order the corners so a box emitted as (x2, y2, x1, y1) still draws;
	        # Pillow rejects rectangles whose second corner precedes the first.
	        x0, x1 = sorted((bbox[0], bbox[2]))
	        y0, y1 = sorted((bbox[1], bbox[3]))

	        # Unknown (or unhashable) labels fall back to the "other" colour.
	        color_key = label if isinstance(label, str) else "other"
	        fill_color_rgba = LABEL_COLORS.get(color_key, LABEL_COLORS["other"])
	        solid_color_rgb = fill_color_rgba[:3]
	        draw.rectangle([x0, y0, x1, y1], fill=fill_color_rgba, outline=solid_color_rgb, width=OUTLINE_WIDTH)

	        # Solid tag in the top-left corner showing "<order>: <label>".
	        tag_text = f"{order}: {label}"
	        tag_bbox = draw.textbbox((0, 0), tag_text, font=font)
	        tag_w, tag_h = tag_bbox[2] - tag_bbox[0], tag_bbox[3] - tag_bbox[1]
	        tag_bg_box = [x0, y0, x0 + tag_w + 10, y0 + tag_h + 6]
	        draw.rectangle(tag_bg_box, fill=solid_color_rgb)
	        draw.text((x0 + 5, y0 + 3), tag_text, font=font, fill="white")

	    visualized_image = Image.alpha_composite(image_rgba, overlay).convert("RGB")
	    return visualized_image, output_text


	def clear_outputs():
	    """Return an empty value for each of the two output components."""
	    cleared_image = None
	    cleared_text = None
	    return cleared_image, cleared_text

	# --- 4. Gradio User Interface ---
	# Build the Gradio page: header text, input/output images side by side, the
	# analyze button, an "Advanced Settings" accordion, the raw-output box,
	# example images, and the event wiring.
	# NOTE(review): the nesting below is the most plausible reading of the
	# layout; confirm which components belong inside the accordion.
	with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection") as demo:

	    gr.Markdown("# 📄 Academic Paper Layout Detection")
	    gr.Markdown(
	        "Welcome! This tool uses a Qwen2.5-VL-3B-Instruct model fine-tuned on our Latex2Layout annotated layout dataset to identify layout regions in academic papers. "
	        "Upload a document image to begin."
	        "\n> Please note: All uploaded images are automatically resized to 924x1204 pixels to meet the model's input requirements."
	    )
	    gr.Markdown("<hr>")

	    # Input on the left, annotated result on the right.
	    with gr.Row():
	        with gr.Column(scale=4):
	            input_image = gr.Image(type="pil", label="Upload Document Image", height=700)
	        with gr.Column(scale=5):
	            output_image = gr.Image(type="pil", label="Analyzed Layout", interactive=False, height=700)

	    with gr.Row():
	        analyze_btn = gr.Button("✨ Analyze Layout", variant="primary", scale=1)

	    # --- Advanced Settings Panel ---
	    with gr.Accordion("Advanced Settings", open=False):
	        # Only the names in MODEL_CHOICES are selectable here.
	        model_selector = gr.Radio(
	            choices=MODEL_CHOICES,
	            value=MODEL_BASE_NAME,
	            label="Select Model",
	            info="Choose which model to use for inference. "
	        )
	        prompt_textbox = gr.Textbox(
	            label="Prompt",
	            value=DEFAULT_PROMPT,
	            lines=5,
	            info="The prompt used to instruct the model."
	        )

	    # Shows the text returned as the second output of the analysis function.
	    output_text = gr.Textbox(label="Model Raw Output", lines=8, interactive=False, visible=True)

	    # Example images are referenced by relative file name.
	    # NOTE(review): no `fn` is passed, so selecting an example presumably only
	    # fills the input image despite the "Click to Run" label - confirm.
	    gr.Examples(
	        examples=[["1.png"], ["2.png"], ["12.png"], ["13.png"], ["14.png"], ["11.png"], ["3.png"], ["7.png"], ["8.png"]],
	        inputs=[input_image],
	        label="Examples (Click to Run)",
	    )

	    gr.Markdown("<p style='text-align:center; color:grey;'>Powered by the Latex2Layout dataset generated by Feijiang Han</p>")

	    # --- Event Handlers ---
	    # Run the analysis on click; its two return values fill the two outputs.
	    analyze_btn.click(
	        fn=analyze_and_visualize_layout,
	        inputs=[input_image, model_selector, prompt_textbox],
	        outputs=[output_image, output_text]
	    )

	    # Reset both outputs whenever a new image is uploaded or the input is cleared.
	    input_image.upload(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
	    input_image.clear(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])

	# --- 5. Launch the Application ---
	# Start the Gradio server only when this file is executed as a script, not
	# when it is imported as a module.
	if __name__ == "__main__":
	    demo.launch()