Commit 31f9cfa
Parent(s): 1265a5f
Update with h2oGPT hash dba6431da758fe9d822c9659f144ee64ea80f111
Files changed:
- generate.py +42 -24
- stopping.py +2 -2
- utils.py +1 -1
generate.py CHANGED

@@ -6,6 +6,7 @@ import typing
 from threading import Thread
 
 import filelock
+import psutil
 
 from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, wrapped_partial
 
@@ -135,7 +136,19 @@ def main(
     api_open = bool(int(os.getenv('API_OPEN', api_open)))
     allow_api = bool(int(os.getenv('ALLOW_API', allow_api)))
 
-    n_gpus = torch.cuda.device_count()
+    n_gpus = torch.cuda.device_count() if torch.cuda.is_available else 0
+    if n_gpus == 0:
+        gpu_id = None
+        load_8bit = False
+        load_half = False
+        infer_devices = False
+        torch.backends.cudnn.benchmark = True
+        torch.backends.cudnn.enabled = False
+        torch.set_default_dtype(torch.float32)
+        if psutil.virtual_memory().available < 94*1024**3:
+            # 12B uses ~94GB
+            # 6.9B uses ~47GB
+            base_model = 'h2oai/h2ogpt-oig-oasst1-512-6.9b'
 
     # get defaults
     model_lower = base_model.lower()
@@ -210,7 +223,7 @@ def main(
     eval_filename = os.path.join(scoring_path, eval_filename)
 
     # torch.device("cuda") leads to cuda:x cuda:y mismatches for multi-GPU consistently
-    context_class = NullContext() if n_gpus > 1 else torch.device("cuda")
+    context_class = NullContext() if n_gpus > 1 or n_gpus == 0 else torch.device("cuda")
 
     with context_class:
         # ensure was set right above before examples generated
@@ -340,7 +353,7 @@ def get_device():
     if torch.cuda.is_available():
         device = "cuda"
     else:
-        [removed line truncated in source]
+        device = "cpu"
 
     return device
 
@@ -381,16 +394,21 @@ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward
     device_map.update(device_map_model)
     print('device_map: %s' % device_map, flush=True)
 
-    if [rest of line truncated in source]
-    [nine further removed lines truncated in source]
+    n_gpus = torch.cuda.device_count() if torch.cuda.is_available else 0
+
+    if n_gpus > 0:
+        if gpu_id >= 0:
+            # FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
+            # So avoid for now, just put on first GPU, unless score_model, put on last
+            if reward_type:
+                device_map = {'': n_gpus - 1}
+            else:
+                device_map = {'': min(n_gpus - 1, gpu_id)}
+        if gpu_id == -1:
+            device_map = {'': 'cuda'}
+    else:
+        device_map = {'': 'cpu'}
+        model_kwargs['load_in_8bit'] = False
 
     load_in_8bit = model_kwargs.get('load_in_8bit', False)
     model_kwargs['device_map'] = device_map
@@ -483,24 +501,24 @@ def get_model(
         model = model_loader(tokenizer,
                              model=base_model,
                              device=0 if device == "cuda" else -1,
-                             torch_dtype=torch.float16)
+                             torch_dtype=torch.float16 if device == 'cuda' else torch.float32)
     else:
-        assert device
+        assert device in ["cuda", "cpu"], "Unsupported device %s" % device
         model_kwargs = dict(local_files_only=local_files_only,
-                            torch_dtype=torch.float16,
+                            torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
                             resume_download=resume_download,
                             use_auth_token=use_auth_token)
         if 'mbart-' not in base_model.lower():
             model_kwargs.update(dict(load_in_8bit=load_8bit,
-                                     device_map={"": 0} if load_8bit else "auto",
+                                     device_map={"": 0} if load_8bit and device == 'cuda' else "auto",
                                      ))
         if 'OpenAssistant/reward-model'.lower() in base_model.lower():
             # could put on other GPUs
-            model_kwargs['device_map'] = {"": 0}
+            model_kwargs['device_map'] = {"": 0} if device == 'cuda' else {"": 'cpu'}
            model_kwargs.pop('torch_dtype', None)
 
         if not lora_weights:
-            with torch.device("cuda"):
+            with torch.device(device):
                 if infer_devices:
                     model = get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
                                                gpu_id=gpu_id, use_auth_token=use_auth_token)
@@ -521,14 +539,14 @@ def get_model(
             model = PeftModel.from_pretrained(
                 model,
                 lora_weights,
-                torch_dtype=torch.float16,
+                torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
                 local_files_only=local_files_only,
                 resume_download=resume_download,
                 use_auth_token=use_auth_token,
-                device_map={"": 0}, # seems to be required
+                device_map={"": 0} if device == 'cuda' else {"": 'cpu'}, # seems to be required
             )
         else:
-            with torch.device("cuda"):
+            with torch.device(device):
                 model = model_loader.from_pretrained(
                     base_model,
                     **model_kwargs
@@ -536,7 +554,7 @@ def get_model(
                 model = PeftModel.from_pretrained(
                     model,
                     lora_weights,
-                    torch_dtype=torch.float16,
+                    torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
                     local_files_only=local_files_only,
                     resume_download=resume_download,
                     use_auth_token=use_auth_token,
@@ -751,7 +769,7 @@ def evaluate(
         # handle fake \n added
         stop_words_ids = [x[1:] if y[0] == '\n' else x for x, y in zip(stop_words_ids, stop_words)]
         # build stopper
-        stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters)])
+        stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters, device=device)])
     else:
         stopping_criteria = StoppingCriteriaList()
 
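Taken together, the generate.py changes add a CPU-only fallback path: when no GPU is visible, main() disables 8-bit loading, half precision and device inference, forces float32, and swaps in the smaller 6.9B checkpoint when available RAM is below the ~94GB a 12B model needs. Below is a minimal, self-contained sketch of that heuristic; cpu_fallback_overrides and its returned dict are our names for illustration, not the committed API, and unlike the committed conditional we actually call torch.cuda.is_available() (see the note after the utils.py diff).

import psutil
import torch

def cpu_fallback_overrides() -> dict:
    # Sketch (hypothetical helper) of the CPU-fallback heuristic this commit adds to main().
    overrides = {}
    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
    if n_gpus == 0:
        # no CUDA: 8-bit, half precision and device inference are all off
        overrides.update(gpu_id=None, load_8bit=False, load_half=False, infer_devices=False)
        torch.set_default_dtype(torch.float32)
        if psutil.virtual_memory().available < 94 * 1024 ** 3:
            # per the diff's comments: 12B uses ~94GB of RAM, 6.9B uses ~47GB
            overrides['base_model'] = 'h2oai/h2ogpt-oig-oasst1-512-6.9b'
    return overrides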
stopping.py CHANGED

@@ -9,11 +9,11 @@ from transformers import StoppingCriteria
 
 class StoppingCriteriaSub(StoppingCriteria):
 
-    def __init__(self, stops=[], encounters=[]):
+    def __init__(self, stops=[], encounters=[], device="cuda"):
         super().__init__()
         assert len(stops) % len(encounters) == 0, "Number of stops and encounters must match"
         self.encounters = encounters
-        self.stops = [stop.to("cuda") for stop in stops]
+        self.stops = [stop.to(device) for stop in stops]
         self.num_stops = [0] * len(stops)
 
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
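The device argument added to StoppingCriteriaSub is what lets the stopper run on CPU: the stop-token tensors must live on the same device as the input_ids that generate() passes to __call__, otherwise the comparison would raise a device-mismatch error. A hypothetical usage sketch follows; the tokenizer, stop strings and encounter counts are our examples, not values from the diff.

import torch
from transformers import AutoTokenizer, StoppingCriteriaList
from stopping import StoppingCriteriaSub

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("h2oai/h2ogpt-oig-oasst1-512-6.9b")

# token-id tensors for each stop phrase (example stop strings)
stop_words = ["<human>:", "<bot>:"]
stop_words_ids = [tokenizer(w, return_tensors="pt")["input_ids"][0] for w in stop_words]

# len(stops) must be a multiple of len(encounters), per the assert in __init__
stopping_criteria = StoppingCriteriaList(
    [StoppingCriteriaSub(stops=stop_words_ids, encounters=[1], device=device)])
# then: model.generate(..., stopping_criteria=stopping_criteria)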
utils.py CHANGED

@@ -46,7 +46,7 @@ def flatten_list(lis):
 
 def clear_torch_cache():
     import torch
-    if torch.cuda.is_available:
+    if torch.cuda.is_available():
         torch.cuda.empty_cache()
         torch.cuda.ipc_collect()
     gc.collect()
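One detail worth underlining from the utils.py hunk: torch.cuda.is_available is a function object, so the bare attribute is always truthy and the old guard could never take the false branch; only calling it returns the real availability flag. The same un-called form appears in the new generate.py lines ("... if torch.cuda.is_available else 0"), whose "else 0" branch is therefore unreachable as written. A two-line check:

import torch

print(bool(torch.cuda.is_available))   # True on every machine: it is a function object
print(torch.cuda.is_available())       # True only when CUDA is actually usable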