Cell: setup | 304.89s

▼ code
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "accelerate>=1.10.1",
#     "torch>=2.7.0",
#     "kernels==0.10.0",
#     "transformers@https://github.com/huggingface/transformers.git",
#     "ipdb>=0.13.13",
#     "matplotlib>=3.7.2",
#     "numpy>=1.24.3",
# ]
# ///

import gc
import sys
import time

import torch
import torch.nn as nn
import torch.profiler
from kernels import register_kernel_mapping, Mode, LayerRepository
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config


def reset_peak_memory_stats():
    """Clear CUDA cache and reset memory allocation counters."""
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    gc.collect()


def get_memory_stats():
    """Get current and peak CUDA memory usage."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }


def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Helper to dynamically override the kernel_layer_name in a model class."""
    for mod in sys.modules.values():
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False


def run_generation(model, inputs, max_tokens=64):
    """Run a single greedy generation pass and measure its duration."""
    with torch.inference_mode():
        start_time = time.perf_counter()
        generated = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,
            temperature=None,
        )
        end_time = time.perf_counter()
    return generated, end_time - start_time


# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
quantization_config = Mxfp4Config(dequantize=True)

# Now we want to add some custom kernel mapping
custom_mapping = dict(
    Yamoe=dict(
        cuda={
            Mode.INFERENCE: LayerRepository(
                repo_id="drbh/yamoe",
                layer_name="Yamoe",
                revision="v0.3.0",
            ),
        },
    )
)
# First add the mapping
register_kernel_mapping(custom_mapping)
# Then override the layer name in the model class
override_kernel_layer_name("GptOssMLP", "Yamoe")
# TODO: remove this line once RMSNorm is working
override_kernel_layer_name("GptOssRMSNorm", None)

## Normal model stuff
model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()

messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")

print("\n=== Running Benchmarks ===")
print(f"Model: {model_id}")
print(f"Device: {torch.cuda.get_device_name()}")
print(f"Initial memory: {get_memory_stats()}\n")

# Warmup
print("Running warmup...")
for _ in range(2):
    _ = run_generation(model, inputs, max_tokens=16)

reset_peak_memory_stats()

# Benchmark runs
num_runs = 5
max_tokens = 64
times = []

print(f"\nRunning {num_runs} benchmark iterations with {max_tokens} tokens...")
for i in range(num_runs):
    reset_peak_memory_stats()
    generated, elapsed = run_generation(model, inputs, max_tokens)
    times.append(elapsed)
    mem_stats = get_memory_stats()
    tokens_per_sec = max_tokens / elapsed
    print(f"Run {i+1}: {elapsed:.3f}s ({tokens_per_sec:.1f} tok/s) | Peak: {mem_stats['peak_gb']:.2f}GB")

# Statistics
avg_time = sum(times) / len(times)
min_time = min(times)
max_time = max(times)
avg_tokens_per_sec = max_tokens / avg_time

print("\n=== Benchmark Results ===")
print(f"Average: {avg_time:.3f}s ({avg_tokens_per_sec:.1f} tok/s)")
print(f"Min: {min_time:.3f}s | Max: {max_time:.3f}s")

# Final memory stats
final_mem = get_memory_stats()
print("\nFinal Memory:")
print(f"  Allocated: {final_mem['allocated_gb']:.2f}GB")
print(f"  Peak: {final_mem['peak_gb']:.2f}GB")
print(f"  Reserved: {final_mem['reserved_gb']:.2f}GB")

print("\n=== Running with Profiler ===")
reset_peak_memory_stats()

with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
) as prof:
    generated, elapsed = run_generation(model, inputs, max_tokens=64)
    print(f"Generation time: {elapsed:.2f} seconds")

# Print profiler results
print("\n=== Top 10 CUDA operations by time ===")
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

print("\n=== Top 10 operations by memory ===")
print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))

# Memory stats
mem_stats = get_memory_stats()
print(f"\nPeak Memory: {mem_stats['peak_gb']:.2f}GB")

# Save trace if needed
prof.export_chrome_trace("trace.json")
print("\nProfile trace saved to trace.json")

# Decode and print output
print("\nGenerated text:")
print(tokenizer.decode(generated[0][inputs["input_ids"].shape[-1]:]))

# Save times and memory stats for charting
with open("benchmark_times.txt", "w") as f:
    for t in times:
        f.write(f"{t}\n")
with open("benchmark_memory.txt", "w") as f:
    f.write(f"{final_mem['allocated_gb']},{final_mem['peak_gb']},{final_mem['reserved_gb']}\n")
# Save avg_tokens_per_sec for charting
with open("benchmark_avg_tokens_per_sec.txt", "w") as f:
    f.write(f"{avg_tokens_per_sec}\n")
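Note that the script header pins matplotlib and numpy but the cell itself never plots; the three benchmark_*.txt files are written "for charting" in a later step. A minimal sketch of such a chart, assuming only the one-float-per-line format of benchmark_times.txt written above (the output filename and styling are illustrative, not part of the cell):

import matplotlib

matplotlib.use("Agg")  # headless rendering; no display attached to the runner
import matplotlib.pyplot as plt

# One elapsed time (seconds) per line, as written by the benchmark cell.
with open("benchmark_times.txt") as f:
    times = [float(line) for line in f if line.strip()]

runs = range(1, len(times) + 1)
fig, ax = plt.subplots(figsize=(6, 3))
ax.bar(runs, times, color="steelblue")
ax.axhline(sum(times) / len(times), color="firebrick", linestyle="--", label="mean")
ax.set_xlabel("Run")
ax.set_ylabel("Latency (s)")
ax.set_title("gpt-oss-20b generation latency (64 new tokens)")
ax.legend()
fig.tight_layout()
fig.savefig("benchmark_times.png", dpi=150)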
▼ output
Overrode GptOssMLP.kernel_layer_name to Yamoe
Overrode GptOssRMSNorm.kernel_layer_name to None
=== Running Benchmarks ===
Model: openai/gpt-oss-20b
Device: NVIDIA L4
Initial memory: {'allocated_gb': 9.390148608, 'peak_gb': 15.5643264, 'reserved_gb': 17.177772032}
Running warmup...
Running 5 benchmark iterations with 64 tokens...
Run 1: 12.075s (5.3 tok/s) | Peak: 9.41GB
Run 2: 12.071s (5.3 tok/s) | Peak: 9.41GB
Run 3: 12.070s (5.3 tok/s) | Peak: 9.41GB
Run 4: 12.071s (5.3 tok/s) | Peak: 9.41GB
Run 5: 12.071s (5.3 tok/s) | Peak: 9.41GB
=== Benchmark Results ===
Average: 12.072s (5.3 tok/s)
Min: 12.070s | Max: 12.075s
Final Memory:
Allocated: 9.40GB
Peak: 9.41GB
Reserved: 10.33GB
=== Running with Profiler ===
Generation time: 12.73 seconds
=== Top 10 CUDA operations by time ===
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg CPU Mem Self CPU Mem CUDA Mem Self CUDA Mem # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
_yamoe_74a2acb_dirty::experts 1.40% 148.156ms 66.87% 7.074s 4.606ms 52.388ms 0.46% 10.583s 6.890ms 0 B -2.98 KB 18.88 MB -2.11 GB 1536
aten::bmm 1.25% 132.560ms 1.75% 185.015ms 29.803us 10.486s 91.79% 10.486s 1.689ms 0 B 0 B 63.12 MB 63.12 MB 6208
void cutlass::Kernel2<cutlass_80_wmma_tensorop_bf16_... 0.00% 0.000us 0.00% 0.000us 0.000us 10.319s 90.32% 10.319s 3.412ms 0 B 0 B 0 B 0 B 3024
aten::linear 0.54% 57.566ms 3.78% 399.802ms 51.627us 0.000us 0.00% 645.165ms 83.312us 0 B 0 B 76.88 MB 0 B 7744
aten::addmm 1.81% 191.354ms 2.57% 272.095ms 35.429us 352.039ms 3.08% 352.151ms 45.853us 0 B 0 B 52.31 MB 52.31 MB 7680
std::enable_if<!(false), void>::type internal::gemvx... 0.00% 0.000us 0.00% 0.000us 0.000us 344.917ms 3.02% 344.917ms 74.982us 0 B 0 B 0 B 0 B 4600
aten::matmul 0.31% 32.441ms 1.72% 181.712ms 56.785us 0.000us 0.00% 303.821ms 94.944us 0 B 0 B 87.68 MB 0 B 3200
std::enable_if<!(false), void>::type internal::gemvx... 0.00% 0.000us 0.00% 0.000us 0.000us 293.850ms 2.57% 293.850ms 97.173us 0 B 0 B 0 B 0 B 3024
aten::mm 0.01% 1.506ms 0.02% 2.161ms 33.768us 293.014ms 2.56% 293.014ms 4.578ms 0 B 0 B 24.56 MB 24.56 MB 64
ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn 0.00% 0.000us 0.00% 0.000us 0.000us 102.278ms 0.90% 102.278ms 4.262ms 0 B 0 B 0 B 0 B 24
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 10.579s
Self CUDA time total: 11.424s
=== Top 10 operations by memory ===
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg CPU Mem Self CPU Mem CUDA Mem Self CUDA Mem # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
aten::empty 0.68% 72.026ms 0.68% 72.026ms 4.244us 0.000us 0.00% 0.000us 0.000us 296 B 296 B 3.49 GB 3.49 GB 16973
aten::clamp 0.46% 48.185ms 0.69% 72.630ms 15.762us 10.269ms 0.09% 10.269ms 2.229us 0 B 0 B 616.69 MB 616.69 MB 4608
aten::mul 1.76% 186.048ms 2.93% 310.383ms 14.181us 47.780ms 0.42% 47.792ms 2.184us 784 B 784 B 554.93 MB 554.93 MB 21888
aten::cat 0.78% 82.030ms 1.22% 129.113ms 16.536us 17.028ms 0.15% 17.030ms 2.181us 0 B 0 B 387.88 MB 387.88 MB 7808
aten::sigmoid 0.09% 9.855ms 0.16% 16.652ms 10.841us 2.889ms 0.03% 2.889ms 1.881us 0 B 0 B 307.97 MB 307.97 MB 1536
aten::empty_strided 1.08% 114.498ms 1.10% 116.720ms 5.564us 0.000us 0.00% 0.000us 0.000us 0 B 0 B 216.60 MB 216.60 MB 20979
aten::add 0.93% 97.861ms 1.56% 164.673ms 15.047us 16.394ms 0.14% 16.395ms 1.498us 0 B 0 B 91.03 MB 91.03 MB 10944
aten::pow 0.36% 38.271ms 0.55% 58.020ms 18.501us 4.117ms 0.04% 4.117ms 1.313us 0 B 0 B 75.58 MB 75.58 MB 3136
aten::bmm 1.25% 132.560ms 1.75% 185.015ms 29.803us 10.486s 91.79% 10.486s 1.689ms 0 B 0 B 63.12 MB 63.12 MB 6208
aten::sub 0.51% 53.869ms 0.82% 87.218ms 13.626us 9.277ms 0.08% 9.355ms 1.461us 0 B 0 B 53.04 MB 53.01 MB 6401
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 10.579s
Self CUDA time total: 11.424s
Peak Memory: 9.41GB
Profile trace saved to trace.json
Generated text:
<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices, so each device holds a slice of the matrix. During forward/backward passes, each device computes partial results and then they are
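Reading the profile above: nearly all of the device time sits in aten::bmm under the _yamoe_74a2acb_dirty::experts op (10.486s self CUDA of 11.424s total, 91.79%), so the batched expert matmuls dominate end-to-end latency. If you want to slice the profile programmatically instead of eyeballing the tables, prof.key_averages() is iterable; a small sketch, assuming the prof object from the cell above (attribute names per torch's FunctionEventAvg; self CUDA times are reported in microseconds):

# Rank the averaged profiler events by self CUDA time.
events = prof.key_averages()
top = sorted(events, key=lambda e: e.self_cuda_time_total, reverse=True)[:5]
for e in top:
    # self_cuda_time_total is in microseconds; convert to seconds for printing
    print(f"{e.key[:60]:60s} {e.self_cuda_time_total / 1e6:8.3f}s  x{e.count}")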
▶ UV Install Logs
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
Fetching 3 files: 33%|███▎ | 1/3 [00:11<00:23, 11.59s/it]
Fetching 3 files: 67%|██████▋ | 2/3 [00:16<00:07, 7.73s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:16<00:00, 5.54s/it]
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
Loading checkpoint shards: 33%|███▎ | 1/3 [00:03<00:06, 3.23s/it]
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:06<00:03, 3.15s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00, 2.50s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00, 2.68s/it]
Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:00, 5.23it/s]
Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 6.19it/s]
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 12.15it/s]
/tmp/uvnote-run-hjgpkuq6/home/.cache/uv/environments-v2/setup-30bb029f3f83f37d/lib/python3.12/site-packages/kernels/layer.py:868: UserWarning:
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
warnings.warn(
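The "No kernel mapping found for layer `None`" warning is expected: it comes from the override_kernel_layer_name("GptOssRMSNorm", None) workaround in the cell, which deliberately leaves RMSNorm on the stock forward path. Once a working RMSNorm kernel exists, the TODO line would be replaced by a real mapping along these lines; this is a hypothetical sketch reusing the cell's own API calls, and repo_id, layer_name, and revision are placeholders, not a published kernel:

# Hypothetical: extend custom_mapping once an RMSNorm kernel is available.
custom_mapping["YamoeRMSNorm"] = dict(
    cuda={
        Mode.INFERENCE: LayerRepository(
            repo_id="drbh/yamoe",       # placeholder repo
            layer_name="YamoeRMSNorm",  # placeholder layer name
            revision="v0.3.0",          # placeholder revision
        ),
    },
)
register_kernel_mapping(custom_mapping)
# Replaces the override_kernel_layer_name("GptOssRMSNorm", None) workaround
override_kernel_layer_name("GptOssRMSNorm", "YamoeRMSNorm")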