hperkins committed
Commit 8a55047 · verified · 1 Parent(s): a469cc0

Update handler.py

Files changed (1)
  1. handler.py +24 -18
handler.py CHANGED
@@ -5,18 +5,19 @@ import json
 
 class EndpointHandler:
     def __init__(self, model_dir):
-        # Load the model with FP32 precision and automatic device placement
+        # Load the model and processor for Qwen2-VL-7B
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
             model_dir,
-            torch_dtype=torch.float32,  # Switch to FP32 for full precision
-            device_map="auto"  # Automatically use available GPUs
+            torch_dtype=torch.float16,  # Using FP16 to reduce memory usage (Switch to FP32 if needed)
+            device_map="auto"  # Automatically assigns the model to the available GPU(s)
         )
-        # Load processor for handling inputs (text, images, videos)
         self.processor = AutoProcessor.from_pretrained(model_dir)
-        # Set the device (GPU or CPU)
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model.to(self.device)  # Move the model to the appropriate device
-        self.model.eval()  # Set model to evaluation mode
+        self.model.eval()
+
+        # Enable gradient checkpointing to save memory during large model runs
+        self.model.gradient_checkpointing_enable()
 
     def preprocess(self, request_data):
         # Extract the 'messages' from incoming request
@@ -26,11 +27,11 @@ class EndpointHandler:
 
         # Process vision inputs (images, videos) from the messages
         image_inputs, video_inputs = process_vision_info(messages)
-        # Prepare text input for the model
+        # Prepare text input for the chat model
         text = self.processor.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
-        # Package inputs (text, images, videos) into tensors
+        # Prepare inputs for the model (text + vision inputs) and return tensors
         inputs = self.processor(
             text=[text],
             images=image_inputs,
@@ -38,23 +39,27 @@ class EndpointHandler:
             padding=True,
             return_tensors="pt",
         )
+
         return inputs.to(self.device)  # Move inputs to the correct device
 
     def inference(self, inputs):
-        # Run model inference (without gradient computation)
+        # Perform inference with the model in no-grad mode (memory efficient)
         with torch.no_grad():
             generated_ids = self.model.generate(
                 **inputs,
-                max_new_tokens=128,  # Limit response length
-                num_beams=1,  # Use beam search for diversity
-                max_batch_size=4  # Increase batch size if memory allows
+                max_new_tokens=128,  # Limit the response length to reduce memory usage
+                num_beams=1,  # Set beam size for lower memory consumption
+                max_batch_size=1  # Keep batch size small for memory optimization
             )
 
-        # Remove input tokens from generated output
+        # Trim generated output (remove input tokens from generated output)
         generated_ids_trimmed = [
             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
 
+        # Clear the CUDA cache after inference to release unused memory
+        torch.cuda.empty_cache()
+
         return generated_ids_trimmed
 
     def postprocess(self, inference_output):
@@ -68,12 +73,13 @@ class EndpointHandler:
         try:
             # Parse the incoming JSON request
             request_data = json.loads(request)
-            # Preprocess inputs
+            # Preprocess inputs (text, images, videos)
             inputs = self.preprocess(request_data)
-            # Perform inference
+            # Perform inference with the model
             outputs = self.inference(inputs)
-            # Postprocess the model output
+            # Postprocess the generated model outputs
             result = self.postprocess(outputs)
-            return json.dumps({"result": result})  # Return result as JSON
+            return json.dumps({"result": result})  # Return the result in JSON format
         except Exception as e:
-            return json.dumps({"error": str(e)})  # Return error message if any
+            # Handle any errors that occur during the process
+            return json.dumps({"error": str(e)})
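For quick local testing, a minimal sketch of how the updated handler might be exercised follows. It is not part of this commit: the entry-point name (__call__), the model directory, and the image URL are assumptions for illustration, since the method wrapping the try/except above is not shown in this diff.

# Hypothetical smoke test for the updated handler (not part of this commit).
# Assumes the method containing the try/except above is __call__ and accepts
# a raw JSON string; the model path and image URL are placeholders.
import json

from handler import EndpointHandler

handler = EndpointHandler("Qwen/Qwen2-VL-7B-Instruct")  # placeholder model_dir

request = json.dumps({
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": "https://example.com/demo.jpg"},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
})

response = handler(request)   # returns a JSON string
print(json.loads(response))   # {"result": ...} on success, {"error": ...} on failure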