hperkins committed
Commit 8a55047 · verified · 1 Parent(s): a469cc0

Update handler.py

Files changed (1)
  1. handler.py +24 -18
handler.py CHANGED
@@ -5,18 +5,19 @@ import json
 
 class EndpointHandler:
     def __init__(self, model_dir):
-        # Load the model with FP32 precision and automatic device placement
+        # Load the model and processor for Qwen2-VL-7B
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
             model_dir,
-            torch_dtype=torch.float32,  # Switch to FP32 for full precision
-            device_map="auto"  # Automatically use available GPUs
+            torch_dtype=torch.float16,  # Using FP16 to reduce memory usage (Switch to FP32 if needed)
+            device_map="auto"  # Automatically assigns the model to the available GPU(s)
         )
-        # Load processor for handling inputs (text, images, videos)
         self.processor = AutoProcessor.from_pretrained(model_dir)
-        # Set the device (GPU or CPU)
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model.to(self.device)  # Move the model to the appropriate device
-        self.model.eval()  # Set model to evaluation mode
+        self.model.eval()
+
+        # Enable gradient checkpointing to save memory during large model runs
+        self.model.gradient_checkpointing_enable()
 
     def preprocess(self, request_data):
         # Extract the 'messages' from incoming request
@@ -26,11 +27,11 @@ class EndpointHandler:
 
         # Process vision inputs (images, videos) from the messages
         image_inputs, video_inputs = process_vision_info(messages)
-        # Prepare text input for the model
+        # Prepare text input for the chat model
         text = self.processor.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
-        # Package inputs (text, images, videos) into tensors
+        # Prepare inputs for the model (text + vision inputs) and return tensors
         inputs = self.processor(
             text=[text],
             images=image_inputs,
@@ -38,23 +39,27 @@ class EndpointHandler:
             padding=True,
             return_tensors="pt",
         )
+
         return inputs.to(self.device)  # Move inputs to the correct device
 
     def inference(self, inputs):
-        # Run model inference (without gradient computation)
+        # Perform inference with the model in no-grad mode (memory efficient)
         with torch.no_grad():
             generated_ids = self.model.generate(
                 **inputs,
-                max_new_tokens=128,  # Limit response length
-                num_beams=1,  # Use beam search for diversity
-                max_batch_size=4  # Increase batch size if memory allows
+                max_new_tokens=128,  # Limit the response length to reduce memory usage
+                num_beams=1,  # Set beam size for lower memory consumption
+                max_batch_size=1  # Keep batch size small for memory optimization
             )
 
-        # Remove input tokens from generated output
+        # Trim generated output (remove input tokens from generated output)
         generated_ids_trimmed = [
             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
 
+        # Clear the CUDA cache after inference to release unused memory
+        torch.cuda.empty_cache()
+
         return generated_ids_trimmed
 
     def postprocess(self, inference_output):
@@ -68,12 +73,13 @@ class EndpointHandler:
         try:
             # Parse the incoming JSON request
             request_data = json.loads(request)
-            # Preprocess inputs
+            # Preprocess inputs (text, images, videos)
             inputs = self.preprocess(request_data)
-            # Perform inference
+            # Perform inference with the model
             outputs = self.inference(inputs)
-            # Postprocess the model output
+            # Postprocess the generated model outputs
             result = self.postprocess(outputs)
-            return json.dumps({"result": result})  # Return result as JSON
+            return json.dumps({"result": result})  # Return the result in JSON format
         except Exception as e:
-            return json.dumps({"error": str(e)})  # Return error message if any
+            # Handle any errors that occur during the process
+            return json.dumps({"error": str(e)})
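For quick local testing, a minimal sketch of how the updated handler might be exercised follows. It is not part of this commit: the entry-point name (__call__), the model directory, and the image URL are assumptions for illustration, since the method wrapping the try/except above is not shown in this diff.

# Hypothetical smoke test for the updated handler (not part of this commit).
# Assumes the method containing the try/except above is __call__ and accepts
# a raw JSON string; the model path and image URL are placeholders.
import json

from handler import EndpointHandler

handler = EndpointHandler("Qwen/Qwen2-VL-7B-Instruct")  # placeholder model_dir

request = json.dumps({
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": "https://example.com/demo.jpg"},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
})

response = handler(request)   # returns a JSON string
print(json.loads(response))   # {"result": ...} on success, {"error": ...} on failure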