Cell: setup | 99.80s | FAILED
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "accelerate>=1.10.1",
#     "torch>=2.7.0",
#     "kernels==0.10.0",
#     "transformers@https://github.com/huggingface/transformers.git",
#     "ipdb>=0.13.13",
#     "matplotlib>=3.7.2",
#     "numpy>=1.24.3",
# ]
# ///
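# uv builds this cell's environment from the PEP 723 inline metadata block above.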

import torch
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
import time
import torch.nn as nn
from kernels import register_kernel_mapping, Mode, LayerRepository
import sys
import torch.profiler
import gc
import logging

# enable INFO-level logging so accelerate/transformers progress messages are visible
logging.basicConfig(level=logging.INFO)

def reset_peak_memory_stats():
    """Clear CUDA cache and reset memory allocation counters."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
    gc.collect()

def get_memory_stats():
    """Get current and peak CUDA memory usage."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }
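# Example usage (illustrative, not invoked in this cell):
#   reset_peak_memory_stats()
#   ... run a forward pass or generate() ...
#   print(get_memory_stats())  # {"allocated_gb": ..., "peak_gb": ..., "reserved_gb": ...}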

def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Helper to dynamically override the kernel_layer_name in a model class."""
    for mod in sys.modules.values():
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False
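# Illustrative only (not called in this cell): the helper can re-point a layer class at a
# kernel registered below by name, e.g. override_kernel_layer_name("GptOssMLP", "Yamoe").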


# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
quantization_config = Mxfp4Config(dequantize=True)


from kernels import replace_kernel_forward_from_hub  # register_kernel_mapping, Mode, LayerRepository are imported above

from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm

replace_kernel_forward_from_hub(GptOssMLP, "Yamoe")  # direct, type-safe
replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
custom_mapping = {
    "Yamoe": {
        "cuda": {
            Mode.INFERENCE: LayerRepository(
                repo_id="drbh/yamoe",
                layer_name="Yamoe",
                revision="v0.3.0",
            )
        }
    }
}
register_kernel_mapping(custom_mapping)
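# Any layer whose kernel_layer_name resolves to "Yamoe" (here, GptOssMLP) will now load its
# CUDA inference forward from the drbh/yamoe repo at revision v0.3.0.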


model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()
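# Note: with use_kernels=True and dequantize=True, the MXFP4 checkpoint is expanded to bf16
# at load time (see the warning in the logs below); that expansion is where this run runs
# out of GPU memory.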

messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")
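# apply_chat_template(..., return_dict=True) returns input_ids and attention_mask, already
# batched and moved to the GPU, ready to pass to generate().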

max_tokens = 512

with torch.inference_mode():
    start_time = time.perf_counter()
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        temperature=None,
    )
    end_time = time.perf_counter()

print(tokenizer.decode(generated[0], skip_special_tokens=False))
print(f"Generation took {end_time - start_time:.2f} seconds")
UV Install Logs

Fetching 3 files: 100%|██████████| 3/3 [00:18<00:00, 6.14s/it]
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards:  67%|██████▋   | 2/3 [00:15<00:07, 7.51s/it]
Traceback (most recent call last):
  File "/tmp/uvnote_5cbrsnjg/.uvnote/cells/setup.py", line 83, in <module>
    model = GptOssForCausalLM.from_pretrained(
  File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/transformers/modeling_utils.py", line 285, in _wrapper
    return func(*args, **kwargs)
  File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/transformers/modeling_utils.py", line 5035, in from_pretrained
    ) = cls._load_pretrained_model(
  File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/transformers/modeling_utils.py", line 5488, in _load_pretrained_model
    _error_msgs, disk_offload_index, cpu_offload_index = load_shard_file(args)
  File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/transformers/modeling_utils.py", line 932, in load_shard_file
    disk_offload_index, cpu_offload_index = _load_state_dict_into_meta_model(
  File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
    return func(*args, **kwargs)
  File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/transformers/modeling_utils.py", line 840, in _load_state_dict_into_meta_model
    hf_quantizer.create_quantized_param(
  File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/transformers/quantizers/quantizer_mxfp4.py", line 249, in create_quantized_param
    dequantize(module, param_name, param_value, target_device, dq_param_name, **shard_kwargs)
  File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/transformers/integrations/mxfp4.py", line 329, in dequantize
    dequantized = convert_moe_packed_tensors(getattr(module, blocks_attr), getattr(module, scales_attr))
  File "/tmp/uvnote-run-vr4catz8/home/.cache/uv/environments-v2/setup-4117b8f0d0f9a3df/lib/python3.13/site-packages/transformers/integrations/mxfp4.py", line 117, in convert_moe_packed_tensors
    idx_hi = (blk >> 4).to(torch.long)
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.98 GiB. GPU 0 has a total capacity of 22.30 GiB of which 1.69 GiB is free. Process 43404 has 20.61 GiB memory in use. Of the allocated memory 17.37 GiB is allocated by PyTorch, and 2.96 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
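The failure occurs while convert_moe_packed_tensors expands the MXFP4 expert weights to bf16, so any fix has to avoid or survive that dequantization step. Below is a minimal mitigation sketch, assembled only from the two hints the log itself gives (PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True and use_kernels=False); it is not part of the original cell and assumes the same 22 GiB GPU, with the MXFP4 Triton kernels available so transformers does not dequantize anyway.

# Mitigation sketch (assumption, not part of the failing cell): apply the two hints from the log above.
import os

# Reduce allocator fragmentation during the large transient allocations of the
# dequantization/loading path; set before torch initializes its CUDA allocator.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

from transformers import GptOssForCausalLM, Mxfp4Config, PreTrainedTokenizerFast

model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)

# Keep the checkpoint in MXFP4 (no dequantize=True) and disable hub kernels, as the
# loader warning suggests; the quantized weights need substantially less GPU memory
# than the bf16 copy that overflowed above.
model = GptOssForCausalLM.from_pretrained(
    model_id,
    quantization_config=Mxfp4Config(),
    device_map="auto",
    use_kernels=False,
).eval()

If the bf16-plus-Yamoe kernel path is the point of the benchmark, only the expandable_segments setting (or a GPU with more than 22 GiB) applies; keeping the weights quantized would bypass the Yamoe MLP mapping registered above.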

Reference kernel