Cell: setup | 99.80s | FAILED
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "accelerate>=1.10.1",
#     "torch>=2.7.0",
#     "kernels==0.10.0",
#     "transformers@https://github.com/huggingface/transformers.git",
#     "ipdb>=0.13.13",
#     "matplotlib>=3.7.2",
#     "numpy>=1.24.3",
# ]
# ///

import torch
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
import time
import torch.nn as nn
from kernels import register_kernel_mapping, Mode, LayerRepository
import sys
import torch.profiler
import gc
import logging

# set logging level to INFO
logging.basicConfig(level=logging.INFO)


def reset_peak_memory_stats():
    """Clear CUDA cache and reset memory allocation counters."""
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    gc.collect()


def get_memory_stats():
    """Get current and peak CUDA memory usage."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }


def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Helper to dynamically override the kernel_layer_name in a model class."""
    for mod in sys.modules.values():
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False


# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
quantization_config = Mxfp4Config(dequantize=True)

from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm

replace_kernel_forward_from_hub(GptOssMLP, "Yamoe")  # direct, type-safe
replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe

custom_mapping = {
    "Yamoe": {
        "cuda": {
            Mode.INFERENCE: LayerRepository(
                repo_id="drbh/yamoe",
                layer_name="Yamoe",
                revision="v0.3.0",
            )
        }
    }
}
register_kernel_mapping(custom_mapping)

model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()

messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")

max_tokens = 512

with torch.inference_mode():
    start_time = time.perf_counter()
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        temperature=None,
    )
    end_time = time.perf_counter()

print(tokenizer.decode(generated[0], skip_special_tokens=False))
print(f"Generation took {end_time - start_time:.2f} seconds")
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
Fetching 3 files: 33%|███▎ | 1/3 [00:15<00:31, 15.83s/it]
Fetching 3 files: 67%|██████▋ | 2/3 [00:18<00:08, 8.05s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:18<00:00, 6.14s/it]
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
Loading checkpoint shards: 33%|███▎ | 1/3 [00:07<00:15, 7.50s/it]
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:14<00:07, 7.33s/it]
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:15<00:07, 7.51s/it]
Traceback (most recent call last):
File "/tmp/uvnote_5cbrsnjg/.uvnote/cells/setup.py", line 83, in <module>
model = GptOssForCausalLM.from_pretrained(
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
model_id,
^^^^^^^^^
...<3 lines>...
quantization_config=quantization_config,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
).eval()
^
File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/transformers/modeling_utils.py", line 285, in _wrapper
return func(*args, **kwargs)
File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/transformers/modeling_utils.py", line 5035, in from_pretrained
) = cls._load_pretrained_model(
~~~~~~~~~~~~~~~~~~~~~~~~~~^
model,
^^^^^^
...<13 lines>...
weights_only=weights_only,
^^^^^^^^^^^^^^^^^^^^^^^^^^
)
^
File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/transformers/modeling_utils.py", line 5488, in _load_pretrained_model
_error_msgs, disk_offload_index, cpu_offload_index = load_shard_file(args)
~~~~~~~~~~~~~~~^^^^^^
File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/transformers/modeling_utils.py", line 932, in load_shard_file
disk_offload_index, cpu_offload_index = _load_state_dict_into_meta_model(
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
model_to_load,
^^^^^^^^^^^^^^
...<13 lines>...
device_mesh=device_mesh,
^^^^^^^^^^^^^^^^^^^^^^^^
)
^
File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
return func(*args, **kwargs)
File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/transformers/modeling_utils.py", line 840, in _load_state_dict_into_meta_model
hf_quantizer.create_quantized_param(
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
model, param, param_name, param_device, state_dict, unexpected_keys
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
)
^
File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/transformers/quantizers/quantizer_mxfp4.py", line 249, in create_quantized_param
dequantize(module, param_name, param_value, target_device, dq_param_name, **shard_kwargs)
~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/transformers/integrations/mxfp4.py", line 329, in dequantize
dequantized = convert_moe_packed_tensors(getattr(module, blocks_attr), getattr(module, scales_attr))
File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/transformers/integrations/mxfp4.py", line 117, in convert_moe_packed_tensors
idx_hi = (blk >> 4).to(torch.long)
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.98 GiB. GPU 0 has a total capacity of 22.30 GiB of which 1.69 GiB is free. Process 43404 has 20.61 GiB memory in use. Of the allocated memory 17.37 GiB is allocated by PyTorch, and 2.96 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
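
The cell fails with a CUDA OOM while convert_moe_packed_tensors dequantizes the MXFP4 expert weights to bf16. The error message itself suggests PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True, and the earlier warning suggests use_kernels=False to keep the quantized checkpoint instead of dequantizing. Below is a minimal, unverified sketch of both mitigations, reusing the model_id and classes from the cell above; whether either variant actually fits on this 22 GiB GPU would still need to be checked.

# Sketch only: the two mitigations suggested by the logs above, not verified on this machine.
import os

# 1) Reduce allocator fragmentation, as the OutOfMemoryError message suggests.
#    Must be set before the CUDA allocator is initialized, i.e. before importing torch.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast

model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)

# 2) Keep the MXFP4-quantized weights instead of dequantizing to bf16:
#    per the warning above, the quantized path requires use_kernels=False,
#    and omitting Mxfp4Config(dequantize=True) skips the dequantization step entirely.
model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=False,
).eval()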