Spaces:
Sleeping
Sleeping
we worked to handle gpu operations
Browse files
main.py
CHANGED
|
@@ -15,6 +15,36 @@ from rewards import get_reward_losses
|
|
| 15 |
from training import LatentNoiseTrainer, get_optimizer
|
| 16 |
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
def setup(args, loaded_model_setup=None):
|
| 19 |
seed_everything(args.seed)
|
| 20 |
bf.makedirs(f"{args.save_dir}/logs/{args.task}")
|
|
@@ -52,14 +82,10 @@ def setup(args, loaded_model_setup=None):
|
|
| 52 |
os.environ["CUDA_VISIBLE_DEVICES"] = args.device_id
|
| 53 |
|
| 54 |
device = torch.device("cuda")
|
| 55 |
-
if args.dtype == "float32
|
| 56 |
-
dtype = torch.float32
|
| 57 |
-
elif args.dtype == "float16":
|
| 58 |
-
dtype = torch.float16
|
| 59 |
|
| 60 |
# If args.model is the same as the one in loaded_model_setup, reuse the trainer and pipe
|
| 61 |
if loaded_model_setup and args.model == loaded_model_setup[0].model:
|
| 62 |
-
# Reuse the trainer and pipe from the loaded model setup
|
| 63 |
print(f"Reusing model {args.model} from loaded setup.")
|
| 64 |
trainer = loaded_model_setup[1] # Trainer is at position 1 in loaded_model_setup
|
| 65 |
|
|
@@ -97,10 +123,13 @@ def setup(args, loaded_model_setup=None):
|
|
| 97 |
width // trainer.model.vae_scale_factor,
|
| 98 |
)
|
| 99 |
|
| 100 |
-
|
| 101 |
enable_grad = not args.no_optim
|
| 102 |
|
| 103 |
-
return args, trainer, device, dtype, shape, enable_grad,
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
# Proceed with full model loading if args.model is different
|
| 106 |
print(f"Loading new model: {args.model}")
|
|
@@ -113,27 +142,8 @@ def setup(args, loaded_model_setup=None):
|
|
| 113 |
args.model, dtype, device, args.cache_dir, args.memsave, args.cpu_offloading
|
| 114 |
)
|
| 115 |
|
| 116 |
-
#
|
| 117 |
-
|
| 118 |
-
if not args.cpu_offloading:
|
| 119 |
-
pipe.to(device)
|
| 120 |
-
except RuntimeError as e:
|
| 121 |
-
if 'out of memory' in str(e):
|
| 122 |
-
print("CUDA OOM error. Attempting to handle OOM situation.")
|
| 123 |
-
# Attempt to clear memory and retry moving to GPU
|
| 124 |
-
torch.cuda.empty_cache() # Free up cached memory
|
| 125 |
-
gc.collect()
|
| 126 |
-
try:
|
| 127 |
-
# Retry loading after clearing cache
|
| 128 |
-
if not args.cpu_offloading:
|
| 129 |
-
pipe.to(device)
|
| 130 |
-
except RuntimeError as e:
|
| 131 |
-
print("Still facing OOM issues. Keeping model on CPU.")
|
| 132 |
-
args.cpu_offloading = True # Force CPU offloading
|
| 133 |
-
else:
|
| 134 |
-
raise e # Re-raise the exception if it's not OOM
|
| 135 |
-
|
| 136 |
-
torch.cuda.empty_cache() # Free up cached memory
|
| 137 |
gc.collect()
|
| 138 |
|
| 139 |
trainer = LatentNoiseTrainer(
|
|
@@ -180,28 +190,47 @@ def setup(args, loaded_model_setup=None):
|
|
| 180 |
torch.cuda.empty_cache() # Free up cached memory
|
| 181 |
gc.collect()
|
| 182 |
|
| 183 |
-
|
| 184 |
-
multi_apply_fn = get_multi_apply_fn(
|
| 185 |
-
model_type=args.multi_step_model,
|
| 186 |
-
seed=args.seed,
|
| 187 |
-
pipe=pipe,
|
| 188 |
-
cache_dir=args.cache_dir,
|
| 189 |
-
device=device if not args.cpu_offloading else 'cpu',
|
| 190 |
-
dtype=dtype,
|
| 191 |
-
)
|
| 192 |
-
else:
|
| 193 |
-
multi_apply_fn = None
|
| 194 |
|
| 195 |
-
|
| 196 |
-
gc.collect()
|
| 197 |
|
| 198 |
-
return args, trainer, device, dtype, shape, enable_grad, multi_apply_fn, settings
|
| 199 |
|
| 200 |
|
| 201 |
|
| 202 |
-
def execute_task(args, trainer, device, dtype, shape, enable_grad,
|
| 203 |
|
| 204 |
if args.task == "single":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
init_latents = torch.randn(shape, device=device, dtype=dtype)
|
| 206 |
latents = torch.nn.Parameter(init_latents, requires_grad=enable_grad)
|
| 207 |
optimizer = get_optimizer(args.optim, latents, args.lr, args.nesterov)
|
|
@@ -383,8 +412,8 @@ def execute_task(args, trainer, device, dtype, shape, enable_grad, multi_apply_f
|
|
| 383 |
|
| 384 |
def main():
|
| 385 |
args = parse_args()
|
| 386 |
-
args, trainer, device, dtype, shape, enable_grad,
|
| 387 |
-
execute_task(args, trainer, device, dtype, shape, enable_grad,
|
| 388 |
|
| 389 |
if __name__ == "__main__":
|
| 390 |
main()
|
|
|
|
| 15 |
from training import LatentNoiseTrainer, get_optimizer
|
| 16 |
|
| 17 |
|
| 18 |
+
import torch
|
| 19 |
+
import gc
|
| 20 |
+
|
| 21 |
+
def _report_cuda_memory(stage):
    """Print the CUDA memory currently allocated and reserved, in MB.

    Args:
        stage: Word inserted into the message ("before" or "after") so the
            two reports printed by ``clear_gpu`` can be told apart.
    """
    bytes_per_mb = 1024 ** 2
    print(f"Memory allocated {stage} clearing: {torch.cuda.memory_allocated() / bytes_per_mb} MB")
    print(f"Memory reserved {stage} clearing: {torch.cuda.memory_reserved() / bytes_per_mb} MB")


def clear_gpu():
    """Run garbage collection and release PyTorch's cached CUDA memory.

    Prints the allocated/reserved CUDA memory before and after the cleanup.
    This function does not move or delete any tensor itself: callers must
    drop their own references (or move their models to CPU) beforehand for
    the corresponding memory to become reclaimable.

    Returns:
        None.
    """
    _report_cuda_memory("before")

    # Collect unreferenced Python objects first so that any CUDA tensors they
    # owned are handed back to the allocator before its cache is emptied.
    gc.collect()

    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # Release cached blocks held by the CUDA allocator
        torch.cuda.ipc_collect()  # Release CUDA IPC (cross-process) memory

    _report_cuda_memory("after")
|
| 37 |
+
|
| 38 |
+
def unload_previous_model_if_needed(loaded_model_setup):
    """Move the previously loaded pipeline to CPU and clear GPU memory.

    Args:
        loaded_model_setup: The tuple returned by an earlier ``setup`` call,
            or ``None`` when no model has been loaded yet. Element ``[0]`` is
            read for its ``.model`` name and element ``[7]`` is treated as the
            pipeline (``setup`` returns ``pipe`` in that position).

    Returns:
        None. Does nothing when ``loaded_model_setup`` is ``None``.
    """
    if loaded_model_setup is None:
        return

    print("Unloading previous model from GPU to free memory.")
    previous_pipe = loaded_model_setup[7]

    # Pipelines whose model name is "flux" are not moved.
    # NOTE(review): the reason is not visible in this file section - confirm.
    if hasattr(previous_pipe, 'to') and loaded_model_setup[0].model != "flux":
        previous_pipe.to('cpu')  # Move model to CPU to free GPU memory

    # This only drops the local name; the caller's tuple still references the
    # pipeline, so the object itself stays alive until the caller releases it.
    del previous_pipe

    clear_gpu()  # Garbage-collect and empty the CUDA cache
|
| 47 |
+
|
| 48 |
def setup(args, loaded_model_setup=None):
|
| 49 |
seed_everything(args.seed)
|
| 50 |
bf.makedirs(f"{args.save_dir}/logs/{args.task}")
|
|
|
|
| 82 |
os.environ["CUDA_VISIBLE_DEVICES"] = args.device_id
|
| 83 |
|
| 84 |
device = torch.device("cuda")
|
| 85 |
+
dtype = torch.float16 if args.dtype == "float16" else torch.float32
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
# If args.model is the same as the one in loaded_model_setup, reuse the trainer and pipe
|
| 88 |
if loaded_model_setup and args.model == loaded_model_setup[0].model:
|
|
|
|
| 89 |
print(f"Reusing model {args.model} from loaded setup.")
|
| 90 |
trainer = loaded_model_setup[1] # Trainer is at position 1 in loaded_model_setup
|
| 91 |
|
|
|
|
| 123 |
width // trainer.model.vae_scale_factor,
|
| 124 |
)
|
| 125 |
|
| 126 |
+
pipe = loaded_model_setup[7]
|
| 127 |
enable_grad = not args.no_optim
|
| 128 |
|
| 129 |
+
return args, trainer, device, dtype, shape, enable_grad, settings, pipe
|
| 130 |
+
|
| 131 |
+
# Unload previous model and clear GPU resources
|
| 132 |
+
unload_previous_model_if_needed(loaded_model_setup)
|
| 133 |
|
| 134 |
# Proceed with full model loading if args.model is different
|
| 135 |
print(f"Loading new model: {args.model}")
|
|
|
|
| 142 |
args.model, dtype, device, args.cache_dir, args.memsave, args.cpu_offloading
|
| 143 |
)
|
| 144 |
|
| 145 |
+
# Final memory cleanup after model loading
|
| 146 |
+
torch.cuda.empty_cache()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
gc.collect()
|
| 148 |
|
| 149 |
trainer = LatentNoiseTrainer(
|
|
|
|
| 190 |
torch.cuda.empty_cache() # Free up cached memory
|
| 191 |
gc.collect()
|
| 192 |
|
| 193 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
+
return args, trainer, device, dtype, shape, enable_grad, settings, pipe
|
|
|
|
| 196 |
|
|
|
|
| 197 |
|
| 198 |
|
| 199 |
|
| 200 |
+
def execute_task(args, trainer, device, dtype, shape, enable_grad, settings, pipe, progress_callback=None):
|
| 201 |
|
| 202 |
if args.task == "single":
|
| 203 |
+
# Attempt to move the model to GPU if model is not Flux
|
| 204 |
+
if args.model != "flux":
|
| 205 |
+
if pipe.device != torch.device('cuda'):
|
| 206 |
+
pipe.to(device, dtype)
|
| 207 |
+
else:
|
| 208 |
+
print(f"PIPE:{pipe}")
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
if args.cpu_offloading:
|
| 212 |
+
pipe.enable_sequential_cpu_offload()
|
| 213 |
+
|
| 214 |
+
#if pipe.device != torch.device('cuda'):
|
| 215 |
+
# pipe.to(device, dtype)
|
| 216 |
+
|
| 217 |
+
if args.enable_multi_apply:
|
| 218 |
+
|
| 219 |
+
multi_apply_fn = get_multi_apply_fn(
|
| 220 |
+
model_type=args.multi_step_model,
|
| 221 |
+
seed=args.seed,
|
| 222 |
+
pipe=pipe,
|
| 223 |
+
cache_dir=args.cache_dir,
|
| 224 |
+
device=device if not args.cpu_offloading else 'cpu',
|
| 225 |
+
dtype=dtype,
|
| 226 |
+
)
|
| 227 |
+
else:
|
| 228 |
+
multi_apply_fn = None
|
| 229 |
+
|
| 230 |
+
torch.cuda.empty_cache() # Free up cached memory
|
| 231 |
+
gc.collect()
|
| 232 |
+
|
| 233 |
+
|
| 234 |
init_latents = torch.randn(shape, device=device, dtype=dtype)
|
| 235 |
latents = torch.nn.Parameter(init_latents, requires_grad=enable_grad)
|
| 236 |
optimizer = get_optimizer(args.optim, latents, args.lr, args.nesterov)
|
|
|
|
| 412 |
|
| 413 |
def main():
    """Parse the arguments, build the run setup, and execute the selected task."""
    args = parse_args()

    # No previous setup is passed in, so setup() cannot reuse a loaded model.
    setup_result = setup(args, loaded_model_setup=None)
    args, trainer, device, dtype, shape, enable_grad, settings, pipe = setup_result

    execute_task(args, trainer, device, dtype, shape, enable_grad, settings, pipe)
|
| 417 |
|
| 418 |
if __name__ == "__main__":
|
| 419 |
main()
|