Spaces:

ChaseHan
/

Latex2Layout-Qwen2.5VL

Running on Zero

App Files Files Community

ChaseHan committed on Jul 15

Commit

bf476ff

verified ·

1 Parent(s): 12c272a

Update app.py

Browse files

Files changed (1) hide show

app.py +143 -58

app.py CHANGED Viewed

@@ -5,33 +5,38 @@ from PIL import Image, ImageDraw, ImageFont
 import json
 import re
 from spaces import GPU
 # --- 1. Configurations and Constants ---
 # Define user-facing names and Hugging Face IDs for the models
 MODEL_BASE_NAME = "Latex2Layout-Base"
 MODEL_BASE_ID = "ChaseHan/Latex2Layout-2000-sync"
-MODEL_ENHANCED_NAME = "Latex2Layout-Noise"
-MODEL_ENHANCED_ID = "ChaseHan/Latex2Layout-2000-sync-enhanced"
-MODEL_CHOICES = [MODEL_BASE_NAME, MODEL_ENHANCED_NAME]
 # Target image size for model input
 TARGET_SIZE = (924, 1204)
 # Visualization Style Constants
 OUTLINE_WIDTH = 3
-# Color mapping for different layout regions (RGBA for transparency)
 LABEL_COLORS = {
-    "title": (255, 82, 82, 90),          # Red
     "abstract": (46, 204, 113, 90),     # Green
     "heading": (52, 152, 219, 90),      # Blue
     "footnote": (241, 196, 15, 90),     # Yellow
     "figure": (155, 89, 182, 90),      # Purple
     "figure caption": (26, 188, 156, 90),# Teal
-    "table": (230, 126, 34, 90),        # Orange
-    "table caption": (44, 62, 80, 90),   # Dark Blue/Gray
-    "math": (231, 76, 60, 90),        # Pomegranate
     "text": (149, 165, 166, 90),       # Gray
     "other": (127, 140, 141, 90)       # Light Gray
 }
@@ -43,71 +48,149 @@ DEFAULT_PROMPT = (
 # --- 2. Load Models and Processor ---
 print("Loading models, this will take some time and VRAM...")
 try:
-    # WARNING: Loading two 3B models without quantization requires a large amount of VRAM (>12 GB).
-    print(f"Loading {MODEL_BASE_NAME}...")
     model_base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
         MODEL_BASE_ID,
         torch_dtype=torch.float16,
         device_map="auto"
     )
-    print(f"Loading {MODEL_ENHANCED_NAME}...")
     model_enhanced = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-        MODEL_ENHANCED_ID,
-        torch_dtype=torch.float16,
         device_map="auto"
     )
     processor = AutoProcessor.from_pretrained(MODEL_BASE_ID)
-    print("All models loaded successfully!")
 except Exception as e:
     print(f"Error loading models: {e}")
     exit()
-# --- 3. Core Inference and Visualization Function ---
 @GPU
 def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name: str, prompt: str, use_greedy: bool, temperature: float, top_p: float, progress=gr.Progress(track_tqdm=True)):
     """
     Takes an image and model parameters, runs inference, and returns a visualized image and raw text output.
     """
     if input_image is None:
         return None, "Please upload an image first."
-    # Select the model based on user's choice
-    model = model_base if selected_model_name == MODEL_BASE_NAME else model_enhanced
-    progress(0, desc=f"Resizing image for {selected_model_name}...")
     image = input_image.resize(TARGET_SIZE).convert("RGBA")
-    messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt}]}]
-    progress(0.2, desc="Preparing model inputs...")
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt").to(model.device)
-    # Dynamically build generation arguments based on user's choice
-    gen_kwargs = {"max_new_tokens": 4096}
-    if use_greedy:
-        gen_kwargs["do_sample"] = False
-    else:
-        gen_kwargs["do_sample"] = True
-        gen_kwargs["temperature"] = temperature
-        gen_kwargs["top_p"] = top_p
-    progress(0.5, desc="Generating layout data...")
-    with torch.no_grad():
-        output_ids = model.generate(**inputs, **gen_kwargs)
-    output_text = processor.batch_decode(output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
-    progress(0.8, desc="Parsing and visualizing results...")
-    try:
-        json_match = re.search(r"```json(.*?)```", output_text, re.DOTALL)
-        json_str = json_match.group(1).strip() if json_match else output_text.strip()
-        results = json.loads(json_str)
-    except (json.JSONDecodeError, AttributeError):
-        return image.convert("RGB"), f"Failed to parse JSON from model output:\n\n{output_text}"
     overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
     draw = ImageDraw.Draw(overlay)
@@ -117,7 +200,9 @@ def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name:
         font = ImageFont.load_default()
     for item in sorted(results, key=lambda x: x.get("order", 999)):
-        bbox, label, order = item.get("bbox_2d"), item.get("label", "other"), item.get("order", "")
         if not bbox or len(bbox) != 4: continue
         fill_color_rgba = LABEL_COLORS.get(label, LABEL_COLORS["other"])
@@ -139,12 +224,12 @@ def clear_outputs():
 def toggle_sampling_params(use_greedy):
     """Updates visibility of temperature and top-p sliders."""
     is_visible = not use_greedy
-    return gr.update(visible=is_visible), gr.update(visible=is_visible)
 # --- 4. Gradio User Interface ---
 with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection") as demo:
     gr.Markdown("# 📄 Academic Paper Layout Detection")
-    gr.Markdown("Welcome! This tool uses a Qwen2.5-VL-3B-Instruct model...") # Truncated for brevity
     gr.Markdown("<hr>")
     with gr.Row():
@@ -154,22 +239,23 @@ with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection")
             output_image = gr.Image(type="pil", label="Analyzed Layout", interactive=False, height=700)
     with gr.Row():
-         analyze_btn = gr.Button("✨ Analyze Layout", variant="primary", scale=1)
     with gr.Accordion("Advanced Settings", open=False):
-        model_selector = gr.Radio(choices=MODEL_CHOICES, value=MODEL_BASE_NAME, label="Select Model")
         prompt_textbox = gr.Textbox(label="Prompt", value=DEFAULT_PROMPT, lines=5)
-        # NEW: Checkbox to toggle between greedy and sampling
         greedy_checkbox = gr.Checkbox(label="Use Greedy Decoding", value=True, info="Faster and deterministic. Uncheck to enable Temperature and Top-p.")
-        # NEW: Sliders are initially hidden
-        with gr.Row(visible=False) as sampling_params:
-            temp_slider = gr.Slider(minimum=0.0, maximum=2.0, step=0.05, value=0.7, label="Temperature")
-            top_p_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.9, label="Top-p")
-    output_text = gr.Textbox(label="Model Raw Output", lines=8, interactive=False, visible=True)
-    gr.Examples(examples=[["1.png"], ["2.png"], ["10.png"]], inputs=[input_image], label="Examples (Click to Run)")
     gr.Markdown("<p style='text-align:center; color:grey;'>Powered by the Latex2Layout dataset by Feijiang Han</p>")
     # --- Event Handlers ---
@@ -182,7 +268,6 @@ with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection")
     input_image.upload(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
     input_image.clear(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
-    # NEW: Event handler to show/hide sliders
     greedy_checkbox.change(
         fn=toggle_sampling_params,
         inputs=greedy_checkbox,

 import json
 import re
 from spaces import GPU
+from peft import PeftModel
 # --- 1. Configurations and Constants ---
 # Define user-facing names and Hugging Face IDs for the models
 MODEL_BASE_NAME = "Latex2Layout-Base"
 MODEL_BASE_ID = "ChaseHan/Latex2Layout-2000-sync"
+MODEL_ENHANCED_NAME = "Qwen2.5-VL + GRPO LoRA (Merged)"
+MODEL_ENHANCED_BASE_ID = "ZelongWang/Qwen2.5-VL-3B-Instruct-DocOD-2"
+MODEL_ENHANCED_LORA_ID = "ZelongWang/Qwen2.5-VL-3B-GRPO-lora-pdf-v3"
+LORA_CHECKPOINT_FOLDER = "checkpoint-525"  # Subfolder containing the adapter
+# --- NEW: Add a name for the Mixing mode ---
+MODEL_MIXING_NAME = "Mixing (Base + Enhanced)"
+MODEL_CHOICES = [MODEL_BASE_NAME, MODEL_ENHANCED_NAME, MODEL_MIXING_NAME]
 # Target image size for model input
 TARGET_SIZE = (924, 1204)
 # Visualization Style Constants
 OUTLINE_WIDTH = 3
 LABEL_COLORS = {
+    "title": (255, 82, 82, 90),         # Red
     "abstract": (46, 204, 113, 90),     # Green
     "heading": (52, 152, 219, 90),      # Blue
     "footnote": (241, 196, 15, 90),     # Yellow
     "figure": (155, 89, 182, 90),      # Purple
     "figure caption": (26, 188, 156, 90),# Teal
+    "table": (230, 126, 34, 90),       # Orange
+    "table caption": (44, 62, 80, 90), # Dark Blue/Gray
+    "math": (231, 76, 60, 90),         # Pomegranate
     "text": (149, 165, 166, 90),       # Gray
     "other": (127, 140, 141, 90)       # Light Gray
 }
 # --- 2. Load Models and Processor ---
 # Both models and the processor are loaded once, at import time, so that the
 # request handler can reuse them. Any failure here aborts start-up.
 print("Loading models, this will take some time and VRAM...")
 try:
     # Load the original base model (float16 weights).
     print(f"Loading base model: {MODEL_BASE_NAME}...")
     model_base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
         MODEL_BASE_ID,
         torch_dtype=torch.float16,
         device_map="auto"
     )
     # Load and merge the new enhanced model directly from the Hub
     print(f"Loading enhanced model base: {MODEL_ENHANCED_BASE_ID}...")
     # Step 1: Load the new base model (bfloat16 weights, unlike model_base).
     model_enhanced = Qwen2_5_VLForConditionalGeneration.from_pretrained(
         MODEL_ENHANCED_BASE_ID,
         torch_dtype=torch.bfloat16,
         device_map="auto",
     )
     print(f"Loading LoRA adapter online from: {MODEL_ENHANCED_LORA_ID}...")
     # Step 2: Load Peft adapter directly from the Hub, specifying the subfolder
     model_enhanced = PeftModel.from_pretrained(
         model_enhanced,
         MODEL_ENHANCED_LORA_ID,
         subfolder=LORA_CHECKPOINT_FOLDER,
         device_map="auto"
     )
     # Step 3: Merge the adapter weights and unload the PeftModel
     print("Merging LoRA adapter...")
     model_enhanced = model_enhanced.merge_and_unload()
     print(f"Successfully loaded and merged model: {MODEL_ENHANCED_NAME}")
     # Load processor
     # NOTE(review): the processor comes from the base model's repo only;
     # presumably both models share the same preprocessing - confirm.
     processor = AutoProcessor.from_pretrained(MODEL_BASE_ID)
     print("All models and processor loaded successfully!")
 except Exception as e:
     print(f"Error loading models: {e}")
     # Terminate with a non-zero status so a failed load is reported as a
     # failure. The bare `exit()` helper exits with status 0 and is only
     # present when the `site` module has been loaded.
     raise SystemExit(1) from e
+# --- 3. Core Inference, Merging, and Visualization ---
def calculate_iou(boxA, boxB):
    """Calculate Intersection over Union (IoU) of two bounding boxes.

    Each box is read through indices 0-3, which are treated as
    ``(x_min, y_min, x_max, y_max)``.

    Args:
        boxA: First box; an indexable holding four numbers.
        boxB: Second box, same layout as ``boxA``.

    Returns:
        float: Intersection area divided by union area. ``0.0`` is returned
        when the boxes do not overlap, when the union area is not positive,
        or when either box is malformed (``None``, fewer than four entries,
        or non-numeric entries).
    """
    try:
        ax1, ay1, ax2, ay2 = boxA[0], boxA[1], boxA[2], boxA[3]
        bx1, by1, bx2, by2 = boxB[0], boxB[1], boxB[2], boxB[3]

        # Width/height of the intersection rectangle, clamped at zero so
        # disjoint boxes contribute no area.
        inter_w = max(0, min(ax2, bx2) - max(ax1, bx1))
        inter_h = max(0, min(ay2, by2) - max(ay1, by1))
        inter_area = inter_w * inter_h

        area_a = (ax2 - ax1) * (ay2 - ay1)
        area_b = (bx2 - bx1) * (by2 - by1)
        union_area = float(area_a + area_b - inter_area)
    except (TypeError, IndexError, KeyError):
        # Boxes come from parsed model output, which is not guaranteed to be
        # well formed. Treat an unusable box as "no overlap" instead of
        # letting one bad entry abort the whole merge.
        return 0.0

    return inter_area / union_area if union_area > 0 else 0.0
 @GPU
 def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name: str, prompt: str, use_greedy: bool, temperature: float, top_p: float, progress=gr.Progress(track_tqdm=True)):
     """
     Takes an image and model parameters, runs inference, and returns a visualized image and raw text output.
+    Supports running a single model or mixing results from two models.
     """
     if input_image is None:
         return None, "Please upload an image first."
+    progress(0, desc="Resizing image...")
     image = input_image.resize(TARGET_SIZE).convert("RGBA")
+    # --- Nested function to run inference on a given model ---
+    def run_inference(model_to_run, model_name_desc):
+        progress(0.1, desc=f"Preparing inputs for {model_name_desc}...")
+        messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt}]}]
+        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt").to(model_to_run.device)
+        gen_kwargs = {"max_new_tokens": 4096}
+        if use_greedy:
+            gen_kwargs["do_sample"] = False
+        else:
+            gen_kwargs["do_sample"] = True
+            gen_kwargs["temperature"] = temperature
+            gen_kwargs["top_p"] = top_p
+        progress(0.5, desc=f"Generating layout data with {model_name_desc}...")
+        with torch.no_grad():
+            output_ids = model_to_run.generate(**inputs, **gen_kwargs)
+        raw_text = processor.batch_decode(output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
+        try:
+            json_match = re.search(r"```json(.*?)```", raw_text, re.DOTALL)
+            json_str = json_match.group(1).strip() if json_match else raw_text.strip()
+            parsed_results = json.loads(json_str)
+            return parsed_results, raw_text
+        except (json.JSONDecodeError, AttributeError):
+            # Return raw text on failure for debugging
+            return None, raw_text
+    # --- Main logic: single model or mixing ---
+    if selected_model_name == MODEL_MIXING_NAME:
+        # Run both models
+        base_results, raw_text_base = run_inference(model_base, "Base Model")
+        enhanced_results, raw_text_enhanced = run_inference(model_enhanced, "Enhanced Model")
+        output_text = f"--- Base Model Output ---\n{raw_text_base}\n\n--- Enhanced Model Output ---\n{raw_text_enhanced}"
+        if base_results is None or enhanced_results is None:
+            return image.convert("RGB"), f"Failed to parse JSON from one or both models:\n\n{output_text}"
+        # Merge results
+        progress(0.8, desc="Merging results from both models...")
+        merged_results = list(base_results)
+        base_bboxes = [item['bbox_2d'] for item in base_results if 'bbox_2d' in item]
+        for enhanced_item in enhanced_results:
+            if 'bbox_2d' not in enhanced_item: continue
+            is_duplicate = False
+            for base_bbox in base_bboxes:
+                iou = calculate_iou(enhanced_item['bbox_2d'], base_bbox)
+                if iou > 0.5:  # IoU threshold for duplication
+                    is_duplicate = True
+                    break
+            if not is_duplicate:
+                merged_results.append(enhanced_item)
+        results = merged_results
+    else:
+        # Run a single model
+        model = model_base if selected_model_name == MODEL_BASE_NAME else model_enhanced
+        results, output_text = run_inference(model, selected_model_name)
+        if results is None:
+            return image.convert("RGB"), f"Failed to parse JSON from model output:\n\n{output_text}"
+    # --- Visualization ---
+    progress(0.9, desc="Visualizing final results...")
     overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
     draw = ImageDraw.Draw(overlay)
         font = ImageFont.load_default()
     for item in sorted(results, key=lambda x: x.get("order", 999)):
+        bbox = item.get("bbox_2d")
+        label = item.get("label", "other")
+        order = item.get("order", "")
         if not bbox or len(bbox) != 4: continue
         fill_color_rgba = LABEL_COLORS.get(label, LABEL_COLORS["other"])
 def toggle_sampling_params(use_greedy):
     """Build the visibility update for the sampling controls.

     The returned update makes its target visible exactly when
     ``use_greedy`` is falsy, i.e. when sampling is enabled.
     """
     # NOTE(review): a single update is returned, so the handler's `outputs`
     # should list exactly one component - confirm against the
     # `greedy_checkbox.change(...)` wiring.
     return gr.update(visible=not use_greedy)
 # --- 4. Gradio User Interface ---
 with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection") as demo:
     gr.Markdown("# 📄 Academic Paper Layout Detection")
+    gr.Markdown("Welcome! This tool uses Qwen2.5-VL models to detect layout components in academic papers. You can choose the **Latex2Layout** model, an **Enhanced** version, or **Mix** the results of both.")
     gr.Markdown("<hr>")
     with gr.Row():
             output_image = gr.Image(type="pil", label="Analyzed Layout", interactive=False, height=700)
     with gr.Row():
+          analyze_btn = gr.Button("✨ Analyze Layout", variant="primary", scale=1)
     with gr.Accordion("Advanced Settings", open=False):
+        model_selector = gr.Radio(
+            choices=MODEL_CHOICES,
+            value=MODEL_MIXING_NAME, # Default to the new mixing mode
+            label="Select Model"
+        )
         prompt_textbox = gr.Textbox(label="Prompt", value=DEFAULT_PROMPT, lines=5)
         greedy_checkbox = gr.Checkbox(label="Use Greedy Decoding", value=True, info="Faster and deterministic. Uncheck to enable Temperature and Top-p.")
+        temp_slider = gr.Slider(minimum=0.0, maximum=2.0, step=0.05, value=0.7, label="Temperature")
+        top_p_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.9, label="Top-p")
+    output_text = gr.Textbox(label="Model Raw Output", lines=10, interactive=False, visible=True)
+    gr.Examples(examples=[["1.png"], ["2.png"], ["12.png"], ["13.png"], ["14.png"], ["11.png"], ["3.png"], ["7.png"], ["8.png"]], inputs=[input_image], label="Examples (Click to Run)")
     gr.Markdown("<p style='text-align:center; color:grey;'>Powered by the Latex2Layout dataset by Feijiang Han</p>")
     # --- Event Handlers ---
     input_image.upload(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
     input_image.clear(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
     greedy_checkbox.change(
         fn=toggle_sampling_params,
         inputs=greedy_checkbox,