Spaces:

tohoku-nlp
/

Sketch2Diagram

Runtime error

App Files Files Community

DaddyDaniel committed on Apr 24

Commit

8557bbe

1 Parent(s): 979c542

Fix inference and Dockerfile

Browse files

- Fix model generation
- Added proper output rendering
- Added download buttons

Files changed (9) hide show

.dockerignore +1 -0
Dockerfile +34 -0
NLP_Group_logo.png +0 -0
main.py +6 -1
main_page.py +6 -0
qwen2_inference.py +62 -12
requirements.txt +11 -2
sketch2diagram.py +39 -11
util.py +26 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .venv

Dockerfile ADDED Viewed

	@@ -0,0 +1,34 @@

+FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+# Set environment variables to reduce interactive prompts
+ENV DEBIAN_FRONTEND=noninteractive
+# Install Python, git, the LaTeX toolchain (provides pdflatex, which util.py
+# invokes) and poppler-utils (presumably the backend pdf2image needs)
+RUN apt-get update && apt-get install -y \
+    python3.10 \
+    python3-pip \
+    git \
+    texlive-latex-base \
+    texlive-latex-extra \
+    texlive-fonts-recommended \
+    texlive-latex-recommended \
+    latexmk \
+    poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+# Install Python dependencies before copying the app, so this layer is only
+# rebuilt when requirements.txt changes
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir -r requirements.txt
+ENV PATH="/root/.local/bin:$PATH"
+# Disable Streamlit's file watcher. Streamlit maps config options to
+# STREAMLIT_<SECTION>_<OPTION> variables, so server.fileWatcherType is
+# STREAMLIT_SERVER_FILE_WATCHER_TYPE. The shorter name is kept as well in
+# case something else reads it.
+# NOTE(review): confirm STREAMLIT_WATCHER_TYPE is still needed.
+ENV STREAMLIT_WATCHER_TYPE=none
+ENV STREAMLIT_SERVER_FILE_WATCHER_TYPE=none
+# Prebuilt flash-attn wheel; its tag (cu124, torch2.6, cp310) has to match the
+# CUDA base image, the torch pin in requirements.txt and the Python version
+RUN pip install --no-cache-dir https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.0.6/flash_attn-2.6.3+cu124torch2.6-cp310-cp310-linux_x86_64.whl
+# Copy the application code
+COPY . .
+# Default command
+ENTRYPOINT ["streamlit", "run", "main.py"]

NLP_Group_logo.png ADDED Viewed

main.py CHANGED Viewed

@@ -1,6 +1,11 @@
 import streamlit as st
-st.logo("NLP_Group_logo.svg", size="large")
 main_page = st.Page("main_page.py", title="Main Page", icon="🏠")
 sketch2diagram_page = st.Page("sketch2diagram.py", title="Sketch2Diagram", icon="🖼️")
 # Add pages to the main page

+import os
 import streamlit as st
+from PIL import Image
+logo_path = os.path.join(os.path.dirname(__file__), "NLP_Group_logo.png")
+logo = Image.open(logo_path)
+st.logo(logo, size="large")
 main_page = st.Page("main_page.py", title="Main Page", icon="🏠")
 sketch2diagram_page = st.Page("sketch2diagram.py", title="Sketch2Diagram", icon="🖼️")
 # Add pages to the main page

main_page.py CHANGED Viewed

@@ -3,3 +3,9 @@ import streamlit as st
 st.title("Tohoku NLP Group - Language and Information Science Laboratory ")
 st.write("Welcome to the Language and Information Science Laboratory!")
 st.write("We are working on various projects and research focused on Visual Language Models.")

 st.title("Tohoku NLP Group - Language and Information Science Laboratory ")
 st.write("Welcome to the Language and Information Science Laboratory!")
 st.write("We are working on various projects and research focused on Visual Language Models.")
+# Link to sketch2diagram page
+st.subheader("You can check out our models and demos here:")
+st.write("[Sketch2Diagram](sketch2diagram) - A model that generates TikZ code from sketches.")

qwen2_inference.py CHANGED Viewed

@@ -1,21 +1,47 @@
 import streamlit as st
 import torch
 from PIL import Image
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 # Inference steps taken from https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
-@st.cache_resource
 def get_model(model_path):
     try:
         with st.spinner(f"Loading model {model_path}"):
-            device = "cuda" if torch.cuda.is_available() else "cpu"
             # Load the model here
             model_import = Qwen2VLForConditionalGeneration.from_pretrained(
-                model_path, torch_dtype="auto", device_map=device
             )
-            processor_import = AutoProcessor.from_pretrained(model_path)
             return model_import, processor_import
     except Exception as e:
@@ -27,27 +53,43 @@ def run_inference(input_file, model_path, args):
     model, processor = get_model(model_path)
     if model is None or processor is None:
         return "Error loading model."
     image = Image.open(input_file)
     conversation = [
         {
             "role": "user",
             "content": [
-                {"type": "image"},
                 {"type": "text", "text": "Please generate TikZ code to draw the diagram of the given image."}
             ],
         }
     ]
-    text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
-    inputs = processor(image, text_prompt, return_tensors="pt").to("cuda")
     output_ids = model.generate(**inputs,
-                                max_new_tokens=args.max_length,
                                 do_sample=True,
-                                top_p=args.top_p,
-                                top_k=args.top_k,
                                 num_return_sequences=1,
-                                temperature=args.temperature
-                            )
     generated_ids = [
         output_ids[len(input_ids):]
         for input_ids, output_ids in zip(inputs.input_ids, output_ids)
@@ -55,4 +97,12 @@ def run_inference(input_file, model_path, args):
     output_text = processor.batch_decode(
         generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
     )
     return output_text

+import os
 import streamlit as st
 import torch
 from PIL import Image
+from dotenv import load_dotenv
+from qwen_vl_utils import process_vision_info
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+load_dotenv()
+HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")
+def print_gpu_memory(label, memory_allocated, memory_reserved):
+    """Print a labelled GPU memory report to stdout.
+
+    Nothing is printed when ``torch.cuda.is_available()`` is false.
+
+    Args:
+        label: Text printed in front of "GPU Memory Usage:".
+        memory_allocated: Allocated amount; divided by 1024 ** 2 and shown
+            in MB.
+        memory_reserved: Reserved amount, converted the same way and printed
+            under the "Cached" heading.
+    """
+    if not torch.cuda.is_available():
+        return
+
+    def in_mb(amount):
+        return amount / 1024 ** 2
+
+    print("-----------------------------------")
+    print(f"{label} GPU Memory Usage:")
+    print(f"Allocated: {in_mb(memory_allocated):.2f} MB")
+    print(f"Cached: {in_mb(memory_reserved):.2f} MB")
 # Inference steps taken from https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
+# @st.cache_resource
 def get_model(model_path):
     try:
         with st.spinner(f"Loading model {model_path}"):
             # Load the model here
             model_import = Qwen2VLForConditionalGeneration.from_pretrained(
+                model_path, torch_dtype="auto", device_map="auto",
+                attn_implementation="flash_attention_2",
+                token=HUGGINGFACE_TOKEN,
             )
+            model_import = model_import.to("cuda")
+            size = {
+                "shortest_edge": 224,
+                "longest_edge": 1024,
+            }
+            processor_import = AutoProcessor.from_pretrained("itsumi-st/imgtikz_qwen2vl",
+                                                             size=size,
+                                                             min_pixels=256 * 256,
+                                                             max_pixels=1024 * 1024,
+                                                             token=HUGGINGFACE_TOKEN)
+            processor_import.tokenizer.padding_side = 'left'
             return model_import, processor_import
     except Exception as e:
     model, processor = get_model(model_path)
     if model is None or processor is None:
         return "Error loading model."
+    # GPU Memory after model loading:
+    after_model_dump = (torch.cuda.memory_allocated(), torch.cuda.memory_reserved())
     image = Image.open(input_file)
     conversation = [
         {
             "role": "user",
             "content": [
+                {"type": "image", "image": image},
                 {"type": "text", "text": "Please generate TikZ code to draw the diagram of the given image."}
             ],
         }
     ]
+    text_prompt = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
+    image_input, video_inputs = process_vision_info(conversation)
+    inputs = processor(
+        text=[text_prompt],
+        images=image_input,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    ).to("cuda")
+    # GPU Memory after input processing
+    after_input_dump = (torch.cuda.memory_allocated(), torch.cuda.memory_reserved())
     output_ids = model.generate(**inputs,
+                                max_new_tokens=args['max_length'],
                                 do_sample=True,
+                                top_p=args['top_p'],
+                                top_k=args['top_k'],
+                                use_cache=True,
                                 num_return_sequences=1,
+                                pad_token_id=processor.tokenizer.pad_token_id,
+                                temperature=args['temperature']
+                                )
     generated_ids = [
         output_ids[len(input_ids):]
         for input_ids, output_ids in zip(inputs.input_ids, output_ids)
     output_text = processor.batch_decode(
         generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
     )
+    # GPU Memory after generation
+    after_gen_dump = (torch.cuda.memory_allocated(), torch.cuda.memory_reserved())
+    print_gpu_memory("Before Model", after_model_dump[0], after_model_dump[1])
+    print_gpu_memory("After Input", after_input_dump[0], after_input_dump[1])
+    print_gpu_memory("After Generation", after_gen_dump[0], after_gen_dump[1])
     return output_text

requirements.txt CHANGED Viewed

@@ -1,3 +1,12 @@
 streamlit~=1.43.2
-transformers~=4.50.0
-pillow~=11.1.0

 streamlit~=1.43.2
+torch==2.6.0
+torchvision==0.21.0
+torchaudio
+transformers==4.48.2
+qwen-vl-utils==0.0.10
+packaging
+accelerate==1.0.1
+requests
+pillow
+python-dotenv
+pdf2image

sketch2diagram.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import streamlit as st
 from qwen2_inference import run_inference
 args = {}
@@ -8,12 +10,16 @@ args = {}
 st.sidebar.title("Model Configuration")
 model_name = st.sidebar.selectbox("Model Name", ['Itsumi-st/Imgtikz_Qwen2vl', 'Qwen/Qwen2-VL-7B-Instruct'])
 args['inference_strat'] = st.sidebar.selectbox("Inference Strategy", ["Iterative", "Multi-candidate"],
-                                            help="Choose the inference strategy for the model. Iterative generates one candidate at a time until an output compiles, while Multi-candidate generates multiple candidates in parallel.")
-args['max_length'] = st.sidebar.slider("Max Length", 1, 5096, 2048, help="Maximum length of the generated output. The model will generate text up to this length.")
 args['seed'] = st.sidebar.number_input("Seed", min_value=0, value=42, step=1)
-args['top_p'] = st.sidebar.slider("Top P", 0.0, 1.0, 1.0, step=0.01, help="Top P sampling parameter. The model will sample from the top P percentage of the probability distribution.")
-args['temperature'] = st.sidebar.slider("Top P", 0.0, 1.0, 0.6, step=0.01, help="Temperature parameter for sampling. Higher values result in more random outputs.")
-args['top_k'] = st.sidebar.slider("Top K", 0, 100, 50, step=1, help="Top K sampling parameter. The model will sample from the top K tokens with the highest probabilities.")
 # Introduction Section
 st.title("Sketch2Diagram")
@@ -22,7 +28,6 @@ st.write("This is a runnable demo of ImgTikZ model introduced in the Sketch2Diag
 st.write("Please refer to the [original paper](https://openreview.net/pdf?id=KvaDHPhhir) for more details.")
 st.write("The model is trained to convert sketches into TikZ code, which can be used to generate vectorized diagrams.")
 # User Input Section
 st.subheader("Upload your sketch")
@@ -32,10 +37,10 @@ input_method = st.selectbox("Input Method", ["Upload", "Camera"],
 input_file = None
 if input_method == "Camera":
     input_file = st.camera_input("Take a picture of your sketch")
-    # Implement camera input functionality here
 else:
     input_file = st.file_uploader("Upload an image of your sketch", type=["png", "jpg", "jpeg"])
 generate_command = None
 # Display the uploaded image
 if input_file is not None:
@@ -45,6 +50,29 @@ if input_file is not None:
 # Run model inference
 if generate_command:
     with st.spinner("Generating TikZ code..."):
-        output = run_inference(input_file, model_name, args)
-        st.success("TikZ code generated successfully!")
-        st.code(output, language='latex')

 import streamlit as st
+from pdf2image import convert_from_path
 from qwen2_inference import run_inference
+from util import compile_tikz_to_pdf
 args = {}
 st.sidebar.title("Model Configuration")
 model_name = st.sidebar.selectbox("Model Name", ['Itsumi-st/Imgtikz_Qwen2vl', 'Qwen/Qwen2-VL-7B-Instruct'])
 args['inference_strat'] = st.sidebar.selectbox("Inference Strategy", ["Iterative", "Multi-candidate"],
+                                               help="Choose the inference strategy for the model. Iterative generates one candidate at a time until an output compiles, while Multi-candidate generates multiple candidates in parallel.")
+args['max_length'] = st.sidebar.slider("Max Length", 1, 5096, 2048,
+                                       help="Maximum length of the generated output. The model will generate text up to this length.")
 args['seed'] = st.sidebar.number_input("Seed", min_value=0, value=42, step=1)
+args['temperature'] = st.sidebar.slider("Temperature", 0.0, 1.0, 0.6, step=0.01,
+                                        help="Temperature parameter for sampling. Higher values result in more random outputs.")
+args['top_p'] = st.sidebar.slider("Top P", 0.0, 1.0, 1.0, step=0.01,
+                                  help="Top P sampling parameter. The model will sample from the top P percentage of the probability distribution.")
+args['top_k'] = st.sidebar.slider("Top K", 0, 100, 50, step=1,
+                                  help="Top K sampling parameter. The model will sample from the top K tokens with the highest probabilities.")
 # Introduction Section
 st.title("Sketch2Diagram")
 st.write("Please refer to the [original paper](https://openreview.net/pdf?id=KvaDHPhhir) for more details.")
 st.write("The model is trained to convert sketches into TikZ code, which can be used to generate vectorized diagrams.")
 # User Input Section
 st.subheader("Upload your sketch")
 input_file = None
 if input_method == "Camera":
     input_file = st.camera_input("Take a picture of your sketch")
+    # todo: Implement camera input functionality here
 else:
     input_file = st.file_uploader("Upload an image of your sketch", type=["png", "jpg", "jpeg"])
+st.write(args)
 generate_command = None
 # Display the uploaded image
 if input_file is not None:
 # Run model inference
 if generate_command:
     with st.spinner("Generating TikZ code..."):
+        output = run_inference(input_file, model_name, args)[0]
+        pdf_file_path = compile_tikz_to_pdf(output)
+        if output and pdf_file_path:
+            st.success("TikZ code generated successfully!")
+            st.code(output, language='latex')
+            st.download_button(
+                label="Download LaTeX Code",
+                data=output,
+                file_name="output.tex",
+                mime="text/plain"
+            )
+            # st.image(pdf_file_path, caption="Generated Diagram", use_column_width=True)
+            with open(pdf_file_path, "rb") as f:
+                st.download_button(
+                    label="Download PDF",
+                    data=f.read(),  # ✅ this is the binary content
+                    file_name="output.pdf",
+                    mime="application/pdf"
+                )
+            images = convert_from_path(pdf_file_path)
+            st.image(images[0], caption="Generated Diagram", use_column_width=True)
+        else:
+            st.error("Failed to generate TikZ code.")

util.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import os
+import subprocess
+import tempfile
+def compile_tikz_to_pdf(tikz_code, timeout=60):
+    """Compile a LaTeX/TikZ document to PDF with ``pdflatex``.
+
+    The source is written to ``output.tex`` inside a fresh temporary
+    directory and compiled there. On success that directory is kept, because
+    the returned PDF lives in it; the caller is responsible for it from then
+    on. On any failure the directory is deleted so that failed attempts do
+    not accumulate on disk.
+
+    Args:
+        tikz_code: Complete LaTeX source to compile.
+        timeout: Seconds to wait for ``pdflatex`` before giving up.
+            ``None`` disables the limit.
+
+    Returns:
+        Path to the generated ``output.pdf``, or ``None`` when the input is
+        empty or not a string, ``pdflatex`` cannot be started, times out,
+        exits with a non-zero status, or leaves no PDF behind.
+    """
+    import shutil
+
+    # Nothing to compile: skip the subprocess entirely.
+    if not isinstance(tikz_code, str) or not tikz_code.strip():
+        print("PDF compilation failed: empty TikZ source")
+        return None
+
+    temp_dir = tempfile.mkdtemp()
+    tex_path = os.path.join(temp_dir, "output.tex")
+    pdf_path = os.path.join(temp_dir, "output.pdf")
+    try:
+        # Explicit encoding so non-ASCII labels do not depend on the locale.
+        with open(tex_path, "w", encoding="utf-8") as f:
+            f.write(tikz_code)
+        # NOTE(review): the source is model-generated; consider passing
+        # -no-shell-escape if the installed TeX distribution supports it.
+        subprocess.run(
+            ["pdflatex", "-interaction=nonstopmode", tex_path],
+            cwd=temp_dir,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            check=True,
+            timeout=timeout,
+        )
+    except (subprocess.SubprocessError, OSError) as e:
+        # SubprocessError covers a non-zero exit and a timeout; OSError
+        # covers a missing pdflatex binary and file-system errors.
+        print("PDF compilation failed:", e)
+    else:
+        if os.path.isfile(pdf_path):
+            return pdf_path
+        print("PDF compilation failed: pdflatex produced no output.pdf")
+
+    # Failure path only: drop the scratch directory.
+    shutil.rmtree(temp_dir, ignore_errors=True)
+    return None