Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -7,8 +7,12 @@ import re
 from spaces import GPU
 
 # --- 1. Configurations and Constants ---
-#
-
+# Define IDs for both models
+MODEL_CHOICES = {
+    "Latex2Layout-2000-sync (Base)": "ChaseHan/Latex2Layout-2000-sync",
+    "Latex2Layout-2000-sync-enhanced (Enhanced)": "ChaseHan/Latex2Layout-2000-sync-enhanced"
+}
+DEFAULT_MODEL_NAME = list(MODEL_CHOICES.keys())[0]
 
 # Target image size for model input
 TARGET_SIZE = (924, 1204)
@@ -34,37 +38,49 @@ DEFAULT_PROMPT = (
     """<image>Please carefully observe the document and detect the following regions: "title", "abstract", "heading", "footnote", "figure", "figure caption", "table", "table caption", "math", "text". Output each detected region's bbox coordinates in JSON format. The format of the output is: <answer>```json[{"bbox_2d": [x1, y1, x2, y2], "label": "region name", "order": "reading order"}]```</answer>."""
 )
 
-# --- 2. Load
-
+# --- 2. Load Models and Processor ---
+# NOTE: Quantization is used to fit two models in memory.
+# Ensure `bitsandbytes` and `accelerate` are in your requirements.txt
+print("Loading models, this will take some time and VRAM...")
+MODELS = {}
 try:
-
-
-
-
-
-
-
+    for name, model_id in MODEL_CHOICES.items():
+        print(f"Loading {name}...")
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_id,
+            torch_dtype=torch.float16,
+            device_map="auto",
+            load_in_4bit=True  # Essential for loading two models
+        )
+        MODELS[name] = model
+
+    # Processor is the same for both models
+    processor = AutoProcessor.from_pretrained(list(MODEL_CHOICES.values())[0])
+    print("All models loaded successfully!")
 except Exception as e:
-    print(f"Error loading
+    print(f"Error loading models: {e}")
     exit()
 
 # --- 3. Core Inference and Visualization Function ---
 @GPU
-def analyze_and_visualize_layout(input_image: Image.Image, prompt: str, temperature: float, top_p: float, progress=gr.Progress(track_tqdm=True)):
+def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name: str, prompt: str, temperature: float, top_p: float, progress=gr.Progress(track_tqdm=True)):
     """
     Takes an image and model parameters, runs inference, and returns a visualized image and raw text output.
     """
     if input_image is None:
         return None, "Please upload an image first."
 
-
+    # Select the model based on user's choice
+    model = MODELS[selected_model_name]
+    progress(0, desc=f"Resizing image for {selected_model_name}...")
+
     image = input_image.resize(TARGET_SIZE)
     image = image.convert("RGBA")
 
     messages = [
         {"role": "user", "content": [
             {"type": "image", "image": image},
-            {"type": "text", "text": prompt}
+            {"type": "text", "text": prompt}
         ]}
     ]
 
@@ -74,11 +90,10 @@ def analyze_and_visualize_layout(input_image: Image.Image, prompt: str, temperat
 
     progress(0.5, desc="Generating layout data...")
     with torch.no_grad():
-        # Pass new parameters to the model generation
         output_ids = model.generate(
             **inputs,
             max_new_tokens=4096,
-            do_sample=True,
+            do_sample=True,
             temperature=temperature,
             top_p=top_p
         )
@@ -150,8 +165,15 @@ with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection")
     with gr.Row():
         analyze_btn = gr.Button("✨ Analyze Layout", variant="primary", scale=1)
 
-    # ---
+    # --- Advanced Settings Panel ---
     with gr.Accordion("Advanced Settings", open=False):
+        # NEW: Model Selector
+        model_selector = gr.Radio(
+            choices=list(MODEL_CHOICES.keys()),
+            value=DEFAULT_MODEL_NAME,
+            label="Select Model",
+            info="Choose which model to use for inference."
+        )
         prompt_textbox = gr.Textbox(
             label="Prompt",
             value=DEFAULT_PROMPT,
@@ -181,7 +203,6 @@ with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection")
         examples=[["1.png"], ["2.png"], ["10.png"], ["11.png"], ["3.png"], ["7.png"], ["8.png"]],
         inputs=[input_image],
         label="Examples (Click to Run)",
-        # Examples now only populate the image input. The user clicks "Analyze" to run with current settings.
    )
 
    gr.Markdown("<p style='text-align:center; color:grey;'>Powered by the Latex2Layout dataset generated by Feijiang Han</p>")
@@ -189,7 +210,7 @@ with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection")
    # --- Event Handlers ---
    analyze_btn.click(
        fn=analyze_and_visualize_layout,
-        inputs=[input_image, prompt_textbox, temp_slider, top_p_slider],
+        inputs=[input_image, model_selector, prompt_textbox, temp_slider, top_p_slider],
        outputs=[output_image, output_text]
    )
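
The commit loads both checkpoints by passing load_in_4bit=True straight to from_pretrained. On recent transformers releases the same 4-bit setup is usually expressed with an explicit BitsAndBytesConfig; the sketch below shows that equivalent. It is not part of the commit, and it assumes transformers, accelerate, and bitsandbytes are installed (e.g. listed in the Space's requirements.txt, as the commit's own NOTE requires).

# Sketch only (not in the commit): explicit 4-bit config equivalent to load_in_4bit=True.
import torch
from transformers import AutoProcessor, BitsAndBytesConfig, Qwen2_5_VLForConditionalGeneration

MODEL_CHOICES = {
    "Latex2Layout-2000-sync (Base)": "ChaseHan/Latex2Layout-2000-sync",
    "Latex2Layout-2000-sync-enhanced (Enhanced)": "ChaseHan/Latex2Layout-2000-sync-enhanced",
}

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # matches the commit's torch_dtype
)

# Keep both 4-bit models resident in memory, keyed by the display name used in the UI.
MODELS = {
    name: Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
    )
    for name, model_id in MODEL_CHOICES.items()
}

# One processor serves both checkpoints, as in the commit.
processor = AutoProcessor.from_pretrained(list(MODEL_CHOICES.values())[0])

Keeping both models resident is what lets the new model_selector radio switch checkpoints per request without reloading weights: the selected name is passed through analyze_and_visualize_layout and used as a key into MODELS.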