import os

import streamlit as st
import torch
from PIL import Image
from dotenv import load_dotenv
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

load_dotenv()
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")
def print_gpu_memory(label, memory_allocated, memory_reserved):
    # Report a previously captured memory snapshot; only prints when CUDA is available.
    if torch.cuda.is_available():
        print("-----------------------------------")
        print(f"{label} GPU Memory Usage:")
        print(f"Allocated: {memory_allocated / 1024 ** 2:.2f} MB")
        print(f"Cached: {memory_reserved / 1024 ** 2:.2f} MB")
# Inference steps taken from https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
def get_model(model_path):
    try:
        with st.spinner(f"Loading model {model_path}"):
            # Load the model; flash_attention_2 requires the flash-attn package and a compatible GPU.
            model_import = Qwen2VLForConditionalGeneration.from_pretrained(
                model_path,
                torch_dtype="auto",
                device_map="auto",
                attn_implementation="flash_attention_2",
                token=HUGGINGFACE_TOKEN,
            )
            size = {
                "shortest_edge": 224,
                "longest_edge": 1024,
            }
            processor_import = AutoProcessor.from_pretrained(
                "itsumi-st/imgtikz_qwen2vl",
                size=size,
                min_pixels=256 * 256,
                max_pixels=1024 * 1024,
                token=HUGGINGFACE_TOKEN,
            )
            # Left padding so generation continues directly after the prompt tokens.
            processor_import.tokenizer.padding_side = 'left'
            return model_import, processor_import
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None, None
def run_inference(input_file, model_path, args):
    model, processor = get_model(model_path)
    if model is None or processor is None:
        return "Error loading model."
    # GPU memory snapshot after model loading
    after_model_dump = (torch.cuda.memory_allocated(), torch.cuda.memory_reserved())
    image = Image.open(input_file)
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "Please generate TikZ code to draw the diagram of the given image."}
            ],
        }
    ]
    text_prompt = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    image_input, video_inputs = process_vision_info(conversation)
    inputs = processor(
        text=[text_prompt],
        images=image_input,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Move the processed tensors to the model's device before generation (as in the referenced model card).
    inputs = inputs.to(model.device)
    # GPU memory snapshot after input processing
    after_input_dump = (torch.cuda.memory_allocated(), torch.cuda.memory_reserved())
    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=int(args["max_length"]),
            do_sample=True,
            top_p=float(args["top_p"]),
            top_k=int(args["top_k"]),
            temperature=float(args["temperature"]),
            use_cache=True,
            num_return_sequences=1,
            pad_token_id=processor.tokenizer.pad_token_id,
        )
    # Drop the prompt tokens so only the newly generated tokens are decoded.
    generated_ids = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    # GPU memory snapshot after generation
    after_gen_dump = (torch.cuda.memory_allocated(), torch.cuda.memory_reserved())
    print_gpu_memory("After Model Load", after_model_dump[0], after_model_dump[1])
    print_gpu_memory("After Input", after_input_dump[0], after_input_dump[1])
    print_gpu_memory("After Generation", after_gen_dump[0], after_gen_dump[1])
    # Clean up
    del inputs, output_ids, generated_ids, image, image_input, video_inputs
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    return output_text
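
# -----------------------------------------------------------------------------
# Minimal usage sketch (not part of the original file): one way a Streamlit UI
# could call run_inference. The widget layout, default sampling values, and the
# model path below are illustrative assumptions, not the Space's actual
# interface.
# -----------------------------------------------------------------------------
st.title("Image to TikZ")
uploaded = st.file_uploader("Upload a diagram image", type=["png", "jpg", "jpeg"])
gen_args = {
    "max_length": st.sidebar.number_input("Max new tokens", 64, 4096, 1024),
    "temperature": st.sidebar.slider("Temperature", 0.1, 2.0, 0.7),
    "top_p": st.sidebar.slider("Top-p", 0.1, 1.0, 0.9),
    "top_k": st.sidebar.slider("Top-k", 1, 100, 50),
}
if uploaded is not None and st.button("Generate TikZ"):
    # Assumed model path: the same repo the processor is loaded from above.
    result = run_inference(uploaded, "itsumi-st/imgtikz_qwen2vl", gen_args)
    # batch_decode returns a list; show the first (and only) sequence.
    st.code(result[0] if isinstance(result, list) else result, language="latex")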