hperkins committed on
Commit
171cc73
·
verified ·
1 Parent(s): f508d32

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +17 -26
handler.py CHANGED
@@ -1,20 +1,19 @@
1
- from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
2
- from qwen_vl_utils import process_vision_info
3
  import torch
4
  import json
 
 
5
 
6
  class EndpointHandler:
7
  def __init__(self, model_dir):
8
  # Load the model and processor for Qwen2-VL-7B
9
  self.model = Qwen2VLForConditionalGeneration.from_pretrained(
10
  model_dir,
11
- torch_dtype=torch.float16, # Use float16 for reduced memory usage
12
- device_map="auto" # Automatically assign to available GPU(s)
13
  )
14
  self.processor = AutoProcessor.from_pretrained(model_dir)
15
  self.model.eval()
16
-
17
- # Enable gradient checkpointing for memory savings
18
  self.model.gradient_checkpointing_enable()
19
 
20
  def preprocess(self, request_data):
@@ -22,7 +21,7 @@ class EndpointHandler:
22
  messages = request_data.get('messages')
23
  if not messages:
24
  raise ValueError("Messages are required")
25
-
26
  # Process vision info (image or video) from the messages
27
  image_inputs, video_inputs = process_vision_info(messages)
28
 
@@ -39,18 +38,18 @@ class EndpointHandler:
39
  padding=True,
40
  return_tensors="pt",
41
  )
42
-
43
- return inputs.to(self.model.device)
44
 
45
  def inference(self, inputs):
46
  # Perform inference with the model
47
  with torch.no_grad():
 
48
  generated_ids = self.model.generate(
49
- **inputs,
50
- max_new_tokens=256, # Increased token length for richer output
51
- num_beams=5, # Increase beam size for better quality
52
- early_stopping=True, # Stop when all beams have finished
53
- max_batch_size=1 # Keep batch size small to manage memory usage
54
  )
55
 
56
  # Trim the output (remove input tokens from generated output)
@@ -72,22 +71,14 @@ class EndpointHandler:
72
 
73
  def __call__(self, request):
74
  try:
75
- # Ensure request is a string before attempting to load it as JSON
76
- if isinstance(request, dict):
77
- request_data = request
78
- else:
79
- request_data = json.loads(request) # Parse the JSON request data
80
-
81
  # Preprocess the input data (text, images, videos)
82
  inputs = self.preprocess(request_data)
83
-
84
  # Perform inference
85
  outputs = self.inference(inputs)
86
-
87
  # Postprocess the output
88
  result = self.postprocess(outputs)
89
-
90
- return json.dumps({"result": result}) # Return a JSON response
91
-
92
  except Exception as e:
93
- return json.dumps({"error": str(e)}) # Return error as JSON
 
 
 
1
  import torch
2
  import json
3
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
4
+ from qwen_vl_utils import process_vision_info
5
 
6
  class EndpointHandler:
7
  def __init__(self, model_dir):
8
  # Load the model and processor for Qwen2-VL-7B
9
  self.model = Qwen2VLForConditionalGeneration.from_pretrained(
10
  model_dir,
11
+ torch_dtype=torch.float16, # FP16 precision to reduce memory
12
+ device_map="auto" # Automatically distribute model across devices
13
  )
14
  self.processor = AutoProcessor.from_pretrained(model_dir)
15
  self.model.eval()
16
+ # Enable gradient checkpointing to save memory
 
17
  self.model.gradient_checkpointing_enable()
18
 
19
  def preprocess(self, request_data):
 
21
  messages = request_data.get('messages')
22
  if not messages:
23
  raise ValueError("Messages are required")
24
+
25
  # Process vision info (image or video) from the messages
26
  image_inputs, video_inputs = process_vision_info(messages)
27
 
 
38
  padding=True,
39
  return_tensors="pt",
40
  )
41
+
42
+ return inputs.to("cuda")
43
 
44
  def inference(self, inputs):
45
  # Perform inference with the model
46
  with torch.no_grad():
47
+ # Generate the output
48
  generated_ids = self.model.generate(
49
+ **inputs,
50
+ max_new_tokens=128,
51
+ num_beams=1,
52
+ max_batch_size=1
 
53
  )
54
 
55
  # Trim the output (remove input tokens from generated output)
 
71
 
72
  def __call__(self, request):
73
  try:
74
+ # Parse the JSON request data
75
+ request_data = json.loads(request)
 
 
 
 
76
  # Preprocess the input data (text, images, videos)
77
  inputs = self.preprocess(request_data)
 
78
  # Perform inference
79
  outputs = self.inference(inputs)
 
80
  # Postprocess the output
81
  result = self.postprocess(outputs)
82
+ return json.dumps({"result": result})
 
 
83
  except Exception as e:
84
+ return json.dumps({"error": str(e)})