import os

import streamlit as st
import torch
from PIL import Image
from dotenv import load_dotenv
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

load_dotenv()
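# The token is only needed for gated or private checkpoints; public models load without it.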
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")


def print_gpu_memory(label, memory_allocated, memory_reserved):
    if torch.cuda.is_available():
        print("-----------------------------------")
        print(f"{label} GPU Memory Usage:")
        print(f"Allocated: {memory_allocated / 1024 ** 2:.2f} MB")
        print(f"Cached: {memory_reserved / 1024 ** 2:.2f} MB")


# Inference steps taken from https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
@st.cache_resource
def get_model(model_path):
    try:
        with st.spinner(f"Loading model {model_path}"):
            # Load the model with automatic dtype selection and device placement;
            # flash_attention_2 requires the flash-attn package and a compatible GPU
            model_import = Qwen2VLForConditionalGeneration.from_pretrained(
                model_path, torch_dtype="auto", device_map="auto",
                attn_implementation="flash_attention_2",
                token=HUGGINGFACE_TOKEN,
            )
            size = {
                "shortest_edge": 224,
                "longest_edge": 1024,
            }
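            # min_pixels/max_pixels bound the resolution the processor resizes images to,
            # which in turn controls how many visual tokens each image consumes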
            processor_import = AutoProcessor.from_pretrained("itsumi-st/imgtikz_qwen2vl",
                                                             size=size,
                                                             min_pixels=256 * 256,
                                                             max_pixels=1024 * 1024,
                                                             token=HUGGINGFACE_TOKEN)
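            # Decoder-only models should be left-padded for batched generation so that
            # new tokens are appended directly after each prompt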
            processor_import.tokenizer.padding_side = 'left'

            return model_import, processor_import
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None, None


def run_inference(input_file, model_path, args):
    model, processor = get_model(model_path)
    if model is None or processor is None:
        return "Error loading model."

    # GPU Memory after model loading:
    after_model_dump = (torch.cuda.memory_allocated(), torch.cuda.memory_reserved())

    image = Image.open(input_file)
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "Please generate TikZ code to draw the diagram of the given image."}
            ],
        }
    ]
    text_prompt = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
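    # process_vision_info collects the image (and any video) inputs referenced in the conversation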
    image_input, video_inputs = process_vision_info(conversation)
    inputs = processor(
        text=[text_prompt],
        images=image_input,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
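    # Move the inputs onto the model's device before generation, as in the reference
    # example on the Qwen2-VL model card
    inputs = inputs.to(model.device)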

    # GPU Memory after input processing
    after_input_dump = (torch.cuda.memory_allocated(), torch.cuda.memory_reserved())

    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=int(args["max_length"]),
            do_sample=True,
            top_p=float(args["top_p"]),
            top_k=int(args["top_k"]),
            temperature=float(args["temperature"]),
            use_cache=True,
            num_return_sequences=1,
            pad_token_id=processor.tokenizer.pad_token_id,
        )
    
    # Strip the prompt tokens from each sequence so only the newly generated text is decoded
    generated_ids = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
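    # batch_decode returns a list of strings; with a single image and
    # num_return_sequences=1 it contains exactly one entry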

    # GPU Memory after generation
    after_gen_dump = (torch.cuda.memory_allocated(), torch.cuda.memory_reserved())

    print_gpu_memory("Before Model", after_model_dump[0], after_model_dump[1])
    print_gpu_memory("After Input", after_input_dump[0], after_input_dump[1])
    print_gpu_memory("After Generation", after_gen_dump[0], after_gen_dump[1])

    # Clean up
    del inputs, output_ids, generated_ids, image, image_input, video_inputs
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

    return output_text
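
# Hypothetical usage from a Streamlit page (the surrounding app code is not shown here;
# model_path would be the checkpoint id the app is configured with):
#   uploaded = st.file_uploader("Diagram image", type=["png", "jpg", "jpeg"])
#   if uploaded is not None:
#       args = {"max_length": 1024, "top_p": 0.9, "top_k": 50, "temperature": 0.7}
#       tikz = run_inference(uploaded, model_path, args)
#       st.code(tikz[0], language="latex")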