DaddyDaniel committed on
Commit 1d66415 · verified · 1 Parent(s): 5ec012b

Use better clean up

Files changed (1)
  1. qwen2_inference.py +20 -11
qwen2_inference.py CHANGED
@@ -73,21 +73,24 @@ def run_inference(input_file, model_path, args):
         videos=video_inputs,
         padding=True,
         return_tensors="pt",
-    ).to("cuda")
+    )
 
     # GPU Memory after input processing
     after_input_dump = (torch.cuda.memory_allocated(), torch.cuda.memory_reserved())
 
-    output_ids = model.generate(**inputs,
-                                max_new_tokens=args['max_length'],
-                                do_sample=True,
-                                top_p=args['top_p'],
-                                top_k=args['top_k'],
-                                use_cache=True,
-                                num_return_sequences=1,
-                                pad_token_id=processor.tokenizer.pad_token_id,
-                                temperature=args['temperature']
-                                )
+    with torch.inference_mode():
+        output_ids = model.generate(
+            **inputs,
+            max_new_tokens=int(args["max_length"]),
+            do_sample=True,
+            top_p=float(args["top_p"]),
+            top_k=int(args["top_k"]),
+            temperature=float(args["temperature"]),
+            use_cache=True,
+            num_return_sequences=1,
+            pad_token_id=processor.tokenizer.pad_token_id,
+        )
+
     generated_ids = [
         output_ids[len(input_ids):]
         for input_ids, output_ids in zip(inputs.input_ids, output_ids)
@@ -103,4 +106,10 @@ def run_inference(input_file, model_path, args):
     print_gpu_memory("After Input", after_input_dump[0], after_input_dump[1])
     print_gpu_memory("After Generation", after_gen_dump[0], after_gen_dump[1])
 
+    # Clean up
+    del inputs, output_ids, generated_ids, image, image_input, video_inputs
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
+
     return output_text
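
For readers skimming the change: the commit combines two memory-hygiene ideas, running generation under torch.inference_mode() so no autograd state is retained, and explicitly dropping tensor references before torch.cuda.empty_cache() so the caching allocator can actually return its blocks. Below is a minimal, self-contained sketch of that pattern; the checkpoint name, prompt, and sampling values are illustrative placeholders, not taken from this repo.

import torch
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct"  # placeholder checkpoint, not from this repo
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto"
)

def generate_once(prompt: str, args: dict) -> str:
    inputs = processor(text=[prompt], padding=True, return_tensors="pt").to(model.device)

    # inference_mode() disables gradient tracking entirely, so generation keeps
    # only the KV cache alive, not autograd metadata.
    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=int(args["max_length"]),
            do_sample=True,
            top_p=float(args["top_p"]),
            top_k=int(args["top_k"]),
            temperature=float(args["temperature"]),
            use_cache=True,
            num_return_sequences=1,
            pad_token_id=processor.tokenizer.pad_token_id,
        )

    trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)]
    text = processor.batch_decode(trimmed, skip_special_tokens=True)[0]

    # Drop the last Python references first: empty_cache() can only return
    # allocator blocks that no live tensor still points at.
    del inputs, output_ids, trimmed
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    return text

print(generate_once("Describe CUDA caching allocators in one sentence.",
                    {"max_length": 64, "top_p": 0.9, "top_k": 50, "temperature": 0.7}))

torch.cuda.ipc_collect() additionally reclaims memory held for CUDA IPC (tensors shared across processes); in a single-process script it does nothing but is harmless, so including it in a generic cleanup path is a reasonable default.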