Spaces:
Runtime error
Runtime error
import json
import re
import sys

import gradio as gr
import torch
from PIL import Image, ImageDraw, ImageFont
from spaces import GPU
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
# --- 1. Configurations and Constants ---
# Model repository on the Hugging Face Hub.
MODEL_ID = "ChaseHan/Latex2Layout-2000-sync"

# Every input page is resized to this (width, height) before inference.
TARGET_SIZE = (924, 1204)

# Stroke width, in pixels, of the rectangle drawn around each region.
OUTLINE_WIDTH = 3

# Alpha shared by every translucent region fill (0 = invisible, 255 = opaque).
_FILL_ALPHA = 90

# Base RGB colour for each layout label; "other" is the fallback entry.
_LABEL_RGB = {
    "title": (255, 82, 82),            # red
    "abstract": (46, 204, 113),        # green
    "heading": (52, 152, 219),         # blue
    "footnote": (241, 196, 15),        # yellow
    "figure": (155, 89, 182),          # purple
    "figure caption": (26, 188, 156),  # teal
    "table": (230, 126, 34),           # orange
    "table caption": (44, 62, 80),     # dark blue/gray
    "math": (231, 76, 60),             # pomegranate
    "text": (149, 165, 166),           # gray
    "other": (127, 140, 141),          # fallback gray
}

# Label -> RGBA fill colour (the RGB part doubles as the opaque outline colour).
LABEL_COLORS = {label: (*rgb, _FILL_ALPHA) for label, rgb in _LABEL_RGB.items()}

# Default instruction sent to the model; it asks for a fenced JSON list of
# {"bbox_2d", "label", "order"} objects wrapped in <answer> tags.
DEFAULT_PROMPT = (
    '<image>Please carefully observe the document and detect the following regions: '
    '"title", "abstract", "heading", "footnote", "figure", "figure caption", '
    '"table", "table caption", "math", "text". '
    "Output each detected region's bbox coordinates in JSON format. "
    'The format of the output is: <answer>```json[{"bbox_2d": [x1, y1, x2, y2], '
    '"label": "region name", "order": "reading order"}]```</answer>.'
)
# --- 2. Load Model and Processor ---
# Runs once at module import: `model` and `processor` are module-level globals
# used by the inference function below.
print("Loading model and processor, this may take a moment...")
try:
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID)
except Exception as e:
    # Top-level boundary: report the failure and stop, since nothing below can
    # work without the model.
    print(f"Error loading model: {e}")
    # Use sys.exit(1) rather than the bare `exit()` helper: `exit` is injected
    # by the `site` module for interactive use and, called without arguments,
    # terminates with status 0, which would report a failed start-up as success.
    sys.exit(1)
else:
    print("Model loaded successfully!")
# --- 3. Core Inference and Visualization Function ---
def _reading_order_key(item):
    """Sort key: regions with a numeric "order" first (ascending), the rest last."""
    try:
        # The prompt describes "order" as a string, so accept "3" as well as 3.
        return (0, float(item.get("order")))
    except (TypeError, ValueError):
        # Missing or non-numeric order: keep after every ordered region
        # (sorted() is stable, so these retain their original relative order).
        return (1, 0.0)


def _normalize_bbox(bbox):
    """Return ``bbox`` as integer ``[x0, y0, x1, y1]`` with x0 <= x1 and y0 <= y1.

    Returns ``None`` when ``bbox`` is not a 4-element list/tuple of finite numbers.
    """
    if not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
        return None
    try:
        x0, y0, x1, y1 = (round(float(value)) for value in bbox)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric entries, NaN (ValueError) or infinity (OverflowError).
        return None
    # Swapped corners would otherwise be rejected by ImageDraw.rectangle.
    return [min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)]


# NOTE(review): `GPU` is imported from `spaces` at the top of the file but is
# never applied anywhere. If this app is meant to run on ZeroGPU hardware, this
# function probably needs the `@GPU` decorator -- confirm against the Space config.
def analyze_and_visualize_layout(input_image: Image.Image, prompt: str, temperature: float, top_p: float, progress=gr.Progress(track_tqdm=True)):
    """
    Run layout detection on an image and draw the detected regions on it.

    Args:
        input_image: Page image to analyse, or ``None`` when nothing was uploaded.
        prompt: Instruction text sent to the model together with the image.
        temperature: Sampling temperature. A value <= 0 selects greedy decoding.
        top_p: Nucleus-sampling threshold; only used when sampling is enabled.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        A ``(image, text)`` tuple. ``image`` is the resized page (RGB) with the
        detected regions overlaid, the plain resized page when the model output
        could not be parsed, or ``None`` when no image was supplied. ``text`` is
        the raw model output, or a user-facing message on failure.
    """
    if input_image is None:
        return None, "Please upload an image first."

    progress(0, desc="Resizing image...")
    # RGBA so the translucent overlay can be alpha-composited onto it later.
    image = input_image.resize(TARGET_SIZE).convert("RGBA")

    messages = [
        {"role": "user", "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt},  # Use the configurable prompt
        ]}
    ]

    progress(0.2, desc="Preparing model inputs...")
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt").to(model.device)

    progress(0.5, desc="Generating layout data...")
    generation_kwargs = {"max_new_tokens": 4096}
    # Sliders may deliver ints (e.g. 1 or 2); transformers expects floats here.
    temperature = float(temperature)
    if temperature > 0:
        # do_sample must be True for temperature/top_p to have an effect.
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=float(top_p))
    else:
        # Sampling with a non-positive temperature is rejected by transformers,
        # so treat temperature 0 as "deterministic" and decode greedily.
        generation_kwargs["do_sample"] = False
    with torch.no_grad():
        output_ids = model.generate(**inputs, **generation_kwargs)

    # Drop the prompt tokens so only the newly generated text is decoded.
    output_text = processor.batch_decode(
        output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
    )[0]

    progress(0.8, desc="Parsing and visualizing results...")
    parse_failure_message = f"Failed to parse JSON from model output:\n\n{output_text}"
    # Prefer the fenced ```json block; otherwise try the whole output as JSON.
    json_match = re.search(r"```json(.*?)```", output_text, re.DOTALL)
    json_str = json_match.group(1).strip() if json_match else output_text.strip()
    try:
        results = json.loads(json_str)
    except json.JSONDecodeError:
        return image.convert("RGB"), parse_failure_message

    # Valid JSON is not necessarily a list of region objects: accept a single
    # object, reject scalars, and ignore list entries that are not objects.
    if isinstance(results, dict):
        results = [results]
    if not isinstance(results, list):
        return image.convert("RGB"), parse_failure_message
    regions = [item for item in results if isinstance(item, dict)]

    overlay = Image.new("RGBA", image.size, (255, 255, 255, 0))
    draw = ImageDraw.Draw(overlay)
    try:
        font = ImageFont.truetype("Arial.ttf", 15)
    except OSError:
        # Font file not available on this system: fall back to Pillow's default.
        font = ImageFont.load_default()

    # NOTE(review): boxes are drawn directly on the resized image, i.e. this
    # assumes the model reports bbox_2d in pixels of the TARGET_SIZE input -- confirm.
    for item in sorted(regions, key=_reading_order_key):
        bbox = _normalize_bbox(item.get("bbox_2d"))
        if bbox is None:
            continue
        # str() keeps the colour lookup safe if the model emits a non-string label.
        label = str(item.get("label", "other"))
        order = item.get("order", "")

        fill_color_rgba = LABEL_COLORS.get(label, LABEL_COLORS["other"])
        solid_color_rgb = fill_color_rgba[:3]
        draw.rectangle(bbox, fill=fill_color_rgba, outline=solid_color_rgb, width=OUTLINE_WIDTH)

        # Opaque "<order>: <label>" tag anchored at the region's top-left corner.
        tag_text = f"{order}: {label}"
        tag_bbox = draw.textbbox((0, 0), tag_text, font=font)
        tag_w, tag_h = tag_bbox[2] - tag_bbox[0], tag_bbox[3] - tag_bbox[1]
        tag_bg_box = [bbox[0], bbox[1], bbox[0] + tag_w + 10, bbox[1] + tag_h + 6]
        draw.rectangle(tag_bg_box, fill=solid_color_rgb)
        draw.text((bbox[0] + 5, bbox[1] + 3), tag_text, font=font, fill="white")

    visualized_image = Image.alpha_composite(image, overlay).convert("RGB")
    return visualized_image, output_text
def clear_outputs():
    """Return a pair of ``None`` values, one per output field to be cleared."""
    cleared_image = cleared_text = None
    return cleared_image, cleared_text
# --- 4. Gradio User Interface ---
# Introductory text shown under the page heading.
_INTRO_MARKDOWN = (
    "Welcome! This tool uses a Qwen2.5-VL-3B-Instruct model fine-tuned on our Latex2Layout annotated layout dataset to identify layout regions in academic papers. "
    "Upload a document image to begin."
    "\n> **Please note:** All uploaded images are automatically resized to 924x1204 pixels to meet the model's input requirements."
)
_FOOTER_HTML = "<p style='text-align:center; color:grey;'>Powered by the Latex2Layout dataset generated by Feijiang Han</p>"

with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection") as demo:
    gr.Markdown("# 📄 Academic Paper Layout Detection")
    gr.Markdown(_INTRO_MARKDOWN)
    gr.Markdown("<hr>")

    # Side-by-side view: uploaded page on the left, annotated result on the right.
    with gr.Row():
        with gr.Column(scale=4):
            input_image = gr.Image(type="pil", label="Upload Document Image", height=700)
        with gr.Column(scale=5):
            output_image = gr.Image(type="pil", label="Analyzed Layout", interactive=False, height=700)

    with gr.Row():
        analyze_btn = gr.Button("✨ Analyze Layout", variant="primary", scale=1)

    # Advanced settings: prompt and sampling parameters, collapsed by default.
    with gr.Accordion("Advanced Settings", open=False):
        prompt_textbox = gr.Textbox(
            label="Prompt",
            value=DEFAULT_PROMPT,
            lines=5,
            info="The prompt used to instruct the model.",
        )
        temp_slider = gr.Slider(
            minimum=0.0,
            maximum=2.0,
            step=0.05,
            value=0.7,
            label="Temperature",
            info="Controls randomness. Higher values mean more random outputs.",
        )
        top_p_slider = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            step=0.05,
            value=0.9,
            label="Top-p (Nucleus Sampling)",
            info="Filters a cumulative probability mass. Lower values are less random.",
        )

    output_text = gr.Textbox(label="Model Raw Output", lines=8, interactive=False, visible=True)

    # Examples only fill the image input; the user still clicks the analyze
    # button to run with the current settings.
    gr.Examples(
        examples=[[example_file] for example_file in ("page_2.png", "page_3.png", "page_5.png", "page_13.png")],
        inputs=[input_image],
        label="Examples (Click to Run)",
    )

    gr.Markdown(_FOOTER_HTML)

    # --- Event Handlers ---
    analyze_btn.click(
        fn=analyze_and_visualize_layout,
        inputs=[input_image, prompt_textbox, temp_slider, top_p_slider],
        outputs=[output_image, output_text],
    )
    # Uploading a new image or clearing the current one resets both outputs.
    for _register_reset in (input_image.upload, input_image.clear):
        _register_reset(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])

# --- 5. Launch the Application ---
if __name__ == "__main__":
    demo.launch()