Spaces:
Running
on
Zero
Running
on
Zero
| import gradio as gr | |
| import torch | |
| from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration | |
| from PIL import Image, ImageDraw, ImageFont | |
| import json | |
| import re | |
| from spaces import GPU | |
# --- 1. Configurations and Constants ---

# User-facing display names and the Hugging Face repository IDs they map to.
MODEL_BASE_NAME = "Latex2Layout-SFT"
MODEL_BASE_ID = "ChaseHan/Latex2Layout-2000-sync"

# NOTE: an earlier revision pointed the RL model at "ChaseHan/Latex2Layout-RL".
MODEL_ENHANCED_NAME = "Latex2Layout-RL"
MODEL_ENHANCED_ID = "ChaseHan/Latex2Layout-2000-sync-enhanced"

# Display name of the mode that runs both models and merges their output.
# It is not listed in MODEL_CHOICES.
MODEL_MIXING_NAME = "Mixing Beta Version(Powerful Mode)"

MODEL_CHOICES = [MODEL_BASE_NAME, MODEL_ENHANCED_NAME]

# Every input image is resized to this (width, height) before inference.
TARGET_SIZE = (924, 1204)

# Visualization style.
OUTLINE_WIDTH = 3

# Alpha shared by every region fill so the page stays readable underneath.
_FILL_ALPHA = 90

# Base RGB colour per layout label; "other" is the fallback colour.
_LABEL_RGB = {
    "title": (255, 82, 82),            # Red
    "abstract": (46, 204, 113),        # Green
    "heading": (52, 152, 219),         # Blue
    "footnote": (241, 196, 15),        # Yellow
    "figure": (155, 89, 182),          # Purple
    "figure caption": (26, 188, 156),  # Teal
    "table": (230, 126, 34),           # Orange
    "table caption": (44, 62, 80),     # Dark Blue/Gray
    "math": (231, 76, 60),             # Pomegranate
    "text": (149, 165, 166),           # Gray
    "other": (127, 140, 141),          # Light Gray
}

# RGBA fill colour for each layout label.
LABEL_COLORS = {
    region: (*rgb, _FILL_ALPHA) for region, rgb in _LABEL_RGB.items()
}

# Default instruction sent to the model together with the page image.
DEFAULT_PROMPT = """<image>Please carefully observe the document and detect the following regions: "title", "abstract", "heading", "footnote", "figure", "figure caption", "table", "table caption", "math", "text". Output each detected region's bbox coordinates in JSON format. The format of the output is: <answer>```json[{"bbox_2d": [x1, y1, x2, y2], "label": "region name", "order": "reading order"}]```</answer>."""
# --- 2. Load Models and Processor ---
def _load_model(display_name, model_id):
    """Announce and load one Qwen2.5-VL checkpoint in float16 with device_map="auto"."""
    print(f"Loading {display_name}...")
    return Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto"
    )


print("Loading models, this will take some time and VRAM...")
try:
    model_base = _load_model(MODEL_BASE_NAME, MODEL_BASE_ID)
    model_enhanced = _load_model(MODEL_ENHANCED_NAME, MODEL_ENHANCED_ID)
    # Both models share one processor, loaded from the base repository.
    processor = AutoProcessor.from_pretrained(MODEL_BASE_ID)
    print("All models loaded successfully!")
except Exception as e:
    print(f"Error loading models: {e}")
    # The app cannot run without its models. Exit with a non-zero status so
    # the hosting process sees a failure; the bare `exit()` used before
    # reported success (status 0) and relies on the optional `site` module.
    raise SystemExit(1)
# --- Helper functions for geometric calculations ---
def calculate_iou(boxA, boxB):
    """Return the Intersection over Union of two [x1, y1, x2, y2] boxes.

    The result is 0 when the union area is not positive.
    """
    # Corners of the overlapping rectangle (it may be empty).
    left, top = max(boxA[0], boxB[0]), max(boxA[1], boxB[1])
    right, bottom = min(boxA[2], boxB[2]), min(boxA[3], boxB[3])
    overlap = max(0, right - left) * max(0, bottom - top)

    area_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    area_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    union = float(area_a + area_b - overlap)

    if union > 0:
        return overlap / union
    return 0
def calculate_intersection_area(boxA, boxB):
    """Return the absolute overlap area of two [x1, y1, x2, y2] boxes (0 if disjoint)."""
    overlap_w = min(boxA[2], boxB[2]) - max(boxA[0], boxB[0])
    overlap_h = min(boxA[3], boxB[3]) - max(boxA[1], boxB[1])
    return max(0, overlap_w) * max(0, overlap_h)


# --- Function to remove nested elements of the same type ---
def _bbox_with_area(item):
    """Return (bbox, area) for an item with a usable box, otherwise None.

    An item is usable when it is a dict whose "bbox_2d" is a 4-element
    list/tuple with a strictly positive area.
    """
    if not isinstance(item, dict):
        return None
    bbox = item.get("bbox_2d")
    if not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
        return None
    try:
        area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
    except TypeError:
        # Non-numeric coordinates cannot be compared geometrically.
        return None
    return (bbox, area) if area > 0 else None


def remove_nested_elements(results):
    """Drop elements that are heavily nested in another element of the same label.

    For every pair of items sharing a label, the smaller box is removed when
    more than 80% of its area lies inside the larger one. When both boxes have
    the same area (for example exact duplicates) only the later item is
    removed, so one copy always survives.

    Args:
        results: Sequence of layout items, each expected to be a dict with
            "label" and "bbox_2d" ([x1, y1, x2, y2]) entries.

    Returns:
        A new list with the nested items removed, in the original order.
        Items without a usable box are never compared and always kept.
    """
    measured = [_bbox_with_area(item) for item in results]
    indices_to_remove = set()

    for i, first in enumerate(measured):
        if first is None:
            continue
        # Visit each unordered pair exactly once; visiting both (i, j) and
        # (j, i) is what made equal-area pairs delete each other.
        for j in range(i + 1, len(measured)):
            second = measured[j]
            if second is None:
                continue
            # Rule only applies to elements with the same label.
            if results[i].get("label") != results[j].get("label"):
                continue

            # On equal areas the later item counts as the "smaller" one.
            if second[1] <= first[1]:
                smaller, larger, smaller_idx = second, first, j
            else:
                smaller, larger, smaller_idx = first, second, i

            intersection = calculate_intersection_area(smaller[0], larger[0])
            if (intersection / smaller[1]) > 0.8:
                indices_to_remove.add(smaller_idx)

    return [item for idx, item in enumerate(results) if idx not in indices_to_remove]
# --- 3. Core Inference and Visualization Function ---
def _normalise_bbox(bbox):
    """Return bbox as [x0, y0, x1, y1] with x0 <= x1 and y0 <= y1, or None if unusable.

    A box is unusable unless it is a list/tuple of exactly four numbers.
    """
    if not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
        return None
    if not all(isinstance(v, (int, float)) and not isinstance(v, bool) for v in bbox):
        return None
    x0, y0, x1, y1 = bbox
    # Swapped corners are reordered; recent Pillow releases reject
    # rectangles whose second corner precedes the first.
    return [min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)]


def _parse_layout_json(raw_text):
    """Extract the list of layout items from the raw model output.

    The JSON payload is taken from the first ```json fenced section when one
    exists, otherwise from the whole text. Returns None when the payload is
    not valid JSON or is neither a list nor an object. Otherwise returns a list
    containing only dict items that carry a usable "bbox_2d" (normalised by
    `_normalise_bbox`); everything else could never be drawn and is dropped.
    """
    fenced = re.search(r"```json(.*?)```", raw_text, re.DOTALL)
    payload = fenced.group(1) if fenced else raw_text
    try:
        parsed = json.loads(payload.strip())
    except json.JSONDecodeError:
        return None

    if isinstance(parsed, dict):
        # Tolerate a single object where a list was requested.
        parsed = [parsed]
    if not isinstance(parsed, list):
        return None

    cleaned = []
    for item in parsed:
        if not isinstance(item, dict):
            continue
        bbox = _normalise_bbox(item.get("bbox_2d"))
        if bbox is not None:
            cleaned.append({**item, "bbox_2d": bbox})
    return cleaned


def _merge_layouts_by_iou(base_results, enhanced_results, iou_threshold=0.5):
    """Return the base items plus every enhanced item that duplicates none of them.

    An enhanced item counts as a duplicate when its IoU with any base box
    exceeds `iou_threshold`. Both inputs must hold items with valid boxes.
    """
    merged = list(base_results)
    base_bboxes = [item["bbox_2d"] for item in base_results]
    for candidate in enhanced_results:
        is_duplicate = any(
            calculate_iou(candidate["bbox_2d"], base_bbox) > iou_threshold
            for base_bbox in base_bboxes
        )
        if not is_duplicate:
            merged.append(candidate)
    return merged


def _safe_order(item):
    """Return the item's "order" as an int, or 999 when it is missing or not convertible."""
    try:
        return int(item.get("order", 999))
    except (ValueError, TypeError, OverflowError):
        # e.g. a non-numeric string such as "abc", None, or an infinite float.
        return 999


def _draw_layout(image_rgba, results):
    """Draw translucent labelled boxes for `results` onto a copy of `image_rgba`.

    Items are drawn in ascending reading order. Returns a new RGB image.
    """
    overlay = Image.new('RGBA', image_rgba.size, (255, 255, 255, 0))
    draw = ImageDraw.Draw(overlay)
    try:
        font = ImageFont.truetype("Arial.ttf", 15)
    except IOError:
        # Arial is not available on every host; fall back to Pillow's built-in font.
        font = ImageFont.load_default()

    for item in sorted(results, key=_safe_order):
        bbox = _normalise_bbox(item.get("bbox_2d"))
        if bbox is None:
            continue
        label, order = item.get("label", "other"), item.get("order", "")

        # Unknown (or non-string, hence possibly unhashable) labels use the fallback colour.
        if isinstance(label, str):
            fill_color_rgba = LABEL_COLORS.get(label, LABEL_COLORS["other"])
        else:
            fill_color_rgba = LABEL_COLORS["other"]
        solid_color_rgb = fill_color_rgba[:3]
        draw.rectangle(bbox, fill=fill_color_rgba, outline=solid_color_rgb, width=OUTLINE_WIDTH)

        # Solid tag in the top-left corner of the box showing "<order>: <label>".
        tag_text = f"{order}: {label}"
        tag_bbox = draw.textbbox((0, 0), tag_text, font=font)
        tag_w, tag_h = tag_bbox[2] - tag_bbox[0], tag_bbox[3] - tag_bbox[1]
        tag_bg_box = [bbox[0], bbox[1], bbox[0] + tag_w + 10, bbox[1] + tag_h + 6]
        draw.rectangle(tag_bg_box, fill=solid_color_rgb)
        draw.text((bbox[0] + 5, bbox[1] + 3), tag_text, font=font, fill="white")

    return Image.alpha_composite(image_rgba, overlay).convert("RGB")


# The Space runs on ZeroGPU hardware, where a GPU is only attached while a
# function decorated with `spaces.GPU` executes. `GPU` was imported for this
# purpose but never applied. NOTE(review): per the Hugging Face docs the
# decorator is effect-free outside ZeroGPU - confirm on the target hardware.
@GPU
def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name: str, prompt: str, progress=gr.Progress(track_tqdm=True)):
    """Detect layout regions in a document image and draw them on the page.

    Args:
        input_image: Uploaded page image, or None when nothing was uploaded.
        selected_model_name: One of the model display names; MODEL_MIXING_NAME
            runs both models and merges their detections by IoU.
        prompt: Instruction text sent to the model alongside the image.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        A tuple (image, text). On success the image is the resized page with
        the detected regions drawn on it and the text is the raw model output.
        When no image was given the image is None; when the model output
        cannot be parsed the plain resized page is returned with an error text.
    """
    if input_image is None:
        return None, "Please upload an image first."

    progress(0, desc="Resizing image...")
    image_resized = input_image.resize(TARGET_SIZE)
    image_rgba = image_resized.convert("RGBA")

    def run_inference(model_to_run, model_name_desc):
        """Run one model on the page; return (parsed items or None, raw text)."""
        progress(0.1, desc=f"Preparing inputs for {model_name_desc}...")
        messages = [{"role": "user", "content": [{"type": "image", "image": image_rgba}, {"type": "text", "text": prompt}]}]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(text=[text], images=[image_rgba], padding=True, return_tensors="pt").to(model_to_run.device)

        progress(0.5, desc=f"Generating layout data with {model_name_desc}...")
        with torch.no_grad():
            output_ids = model_to_run.generate(**inputs, max_new_tokens=4096*2, do_sample=False)
        # Decode only the newly generated tokens, not the echoed prompt.
        raw_text = processor.batch_decode(output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
        return _parse_layout_json(raw_text), raw_text

    if selected_model_name == MODEL_MIXING_NAME:
        base_results, raw_text_base = run_inference(model_base, "Base Model")
        enhanced_results, raw_text_enhanced = run_inference(model_enhanced, "Enhanced Model")
        output_text = f"--- Base Model Output ---\n{raw_text_base}\n\n--- Enhanced Model Output ---\n{raw_text_enhanced}"
        if base_results is None or enhanced_results is None:
            return image_rgba.convert("RGB"), f"Failed to parse JSON from one or both models:\n\n{output_text}"

        progress(0.8, desc="Merging results based on IoU...")
        results = _merge_layouts_by_iou(base_results, enhanced_results)
    else:
        model = model_base if selected_model_name == MODEL_BASE_NAME else model_enhanced
        results, output_text = run_inference(model, selected_model_name)
        if results is None:
            return image_rgba.convert("RGB"), f"Failed to parse JSON from model output:\n\n{output_text}"

    # Final post-processing step: drop same-label boxes nested inside larger ones.
    progress(0.85, desc="Cleaning up nested elements...")
    results = remove_nested_elements(results)

    # --- Visualization ---
    progress(0.9, desc="Visualizing final results...")
    visualized_image = _draw_layout(image_rgba, results)
    return visualized_image, output_text
def clear_outputs():
    """Return a (None, None) pair that resets both output components."""
    cleared_image = None
    cleared_text = None
    return cleared_image, cleared_text
# --- 4. Gradio User Interface ---
with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection") as demo:
    # Header: title, short introduction and the resize notice.
    intro_markdown = (
        "Welcome! This tool uses a Qwen2.5-VL-3B-Instruct model fine-tuned on our Latex2Layout annotated layout dataset to identify layout regions in academic papers. "
        "Upload a document image to begin."
        "\n> **Please note:** All uploaded images are automatically resized to 924x1204 pixels to meet the model's input requirements."
    )
    gr.Markdown("# 📄 Academic Paper Layout Detection")
    gr.Markdown(intro_markdown)
    gr.Markdown("<hr>")

    # Input page on the left, annotated result on the right.
    with gr.Row():
        with gr.Column(scale=4):
            input_image = gr.Image(type="pil", label="Upload Document Image", height=700)
        with gr.Column(scale=5):
            output_image = gr.Image(type="pil", label="Analyzed Layout", interactive=False, height=700)

    with gr.Row():
        analyze_btn = gr.Button("✨ Analyze Layout", variant="primary", scale=1)

    # --- Advanced Settings Panel ---
    with gr.Accordion("Advanced Settings", open=False):
        model_selector = gr.Radio(
            choices=MODEL_CHOICES,
            value=MODEL_BASE_NAME,
            label="Select Model",
            info="Choose which model to use for inference. "
        )
        prompt_textbox = gr.Textbox(
            label="Prompt",
            value=DEFAULT_PROMPT,
            lines=5,
            info="The prompt used to instruct the model."
        )

    output_text = gr.Textbox(label="Model Raw Output", lines=8, interactive=False, visible=True)

    # Bundled example pages, shown in this order.
    example_files = ["1.png", "2.png", "12.png", "13.png", "14.png", "11.png", "3.png", "7.png", "8.png"]
    gr.Examples(
        examples=[[file_name] for file_name in example_files],
        inputs=[input_image],
        label="Examples (Click to Run)",
    )
    gr.Markdown("<p style='text-align:center; color:grey;'>Powered by the Latex2Layout dataset generated by Feijiang Han</p>")

    # --- Event Handlers ---
    result_components = [output_image, output_text]
    analyze_btn.click(
        fn=analyze_and_visualize_layout,
        inputs=[input_image, model_selector, prompt_textbox],
        outputs=result_components
    )
    # Uploading a new image or clearing the current one resets both outputs.
    for reset_trigger in (input_image.upload, input_image.clear):
        reset_trigger(fn=clear_outputs, inputs=None, outputs=result_components)

# --- 5. Launch the Application ---
if __name__ == "__main__":
    demo.launch()