Update handler.py
handler.py  CHANGED  (+24, -18)
@@ -5,18 +5,19 @@ import json
 
 class EndpointHandler:
     def __init__(self, model_dir):
-        # Load the model
+        # Load the model and processor for Qwen2-VL-7B
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
             model_dir,
-            torch_dtype=torch.
-            device_map="auto"  # Automatically
+            torch_dtype=torch.float16,  # Using FP16 to reduce memory usage (Switch to FP32 if needed)
+            device_map="auto"  # Automatically assigns the model to the available GPU(s)
         )
-        # Load processor for handling inputs (text, images, videos)
         self.processor = AutoProcessor.from_pretrained(model_dir)
-        # Set the device (GPU or CPU)
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model.to(self.device)  # Move the model to the appropriate device
-        self.model.eval()
+        self.model.eval()
+
+        # Enable gradient checkpointing to save memory during large model runs
+        self.model.gradient_checkpointing_enable()
 
     def preprocess(self, request_data):
         # Extract the 'messages' from incoming request
@@ -26,11 +27,11 @@ class EndpointHandler:
 
         # Process vision inputs (images, videos) from the messages
         image_inputs, video_inputs = process_vision_info(messages)
-        # Prepare text input for the model
+        # Prepare text input for the chat model
         text = self.processor.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
-        #
+        # Prepare inputs for the model (text + vision inputs) and return tensors
         inputs = self.processor(
             text=[text],
             images=image_inputs,
@@ -38,23 +39,27 @@
             padding=True,
             return_tensors="pt",
         )
+
         return inputs.to(self.device)  # Move inputs to the correct device
 
     def inference(self, inputs):
-        #
+        # Perform inference with the model in no-grad mode (memory efficient)
         with torch.no_grad():
             generated_ids = self.model.generate(
                 **inputs,
-                max_new_tokens=128,  # Limit response length
-                num_beams=1,  #
-                max_batch_size=
+                max_new_tokens=128,  # Limit the response length to reduce memory usage
+                num_beams=1,  # Set beam size for lower memory consumption
+                max_batch_size=1  # Keep batch size small for memory optimization
             )
 
-        #
+        # Trim generated output (remove input tokens from generated output)
         generated_ids_trimmed = [
             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
 
+        # Clear the CUDA cache after inference to release unused memory
+        torch.cuda.empty_cache()
+
        return generated_ids_trimmed
 
     def postprocess(self, inference_output):
@@ -68,12 +73,13 @@ class EndpointHandler:
         try:
             # Parse the incoming JSON request
             request_data = json.loads(request)
-            # Preprocess inputs
+            # Preprocess inputs (text, images, videos)
             inputs = self.preprocess(request_data)
-            # Perform inference
+            # Perform inference with the model
             outputs = self.inference(inputs)
-            # Postprocess the model
+            # Postprocess the generated model outputs
             result = self.postprocess(outputs)
-            return json.dumps({"result": result})  # Return result
+            return json.dumps({"result": result})  # Return the result in JSON format
         except Exception as e:
-
+            # Handle any errors that occur during the process
+            return json.dumps({"error": str(e)})
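For a quick local smoke test of the updated handler, the sketch below constructs EndpointHandler and pushes one JSON request through it. This is only an illustrative sketch: the entry-point name (__call__), the "from handler import EndpointHandler" import path, the model directory, and the exact shape of the messages payload are assumptions, since those parts of handler.py are not visible in the hunks above. Only the json.loads(request) parsing, the 'messages' extraction, and the {"result": ...} / {"error": ...} JSON responses are taken from the diff itself.

# Local smoke test for the updated handler.py (illustrative names; see note above).
import json

from handler import EndpointHandler  # assumed import path for this file

# Assumed to point at a local download of the Qwen2-VL-7B weights + processor files.
handler = EndpointHandler(model_dir="./qwen2-vl-7b-instruct")

# Chat-style request: the handler's entry point (assumed to be __call__) runs
# json.loads(request), preprocess() reads the 'messages' list, and
# process_vision_info(messages) picks up the image entry below.
request = json.dumps({
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": "https://example.com/sample.jpg"},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
})

response = handler(request)  # returns '{"result": ...}' or '{"error": "..."}'
print(json.loads(response))

Running this end to end needs torch, transformers, and qwen-vl-utils installed, and with torch_dtype=torch.float16 a CUDA-capable GPU; both request and response stay plain JSON strings, matching what the try/except block in the diff serializes.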