diff --git a/.gitattributes b/.gitattributes index e876c71818134e1e50ea0f25b9f1914428dd7a40..bd95ef366a7925e5606949714cdbf51f01c5fbc5 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,4 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text -moe_benchmarks/megablocks_yamoe/artifacts/visualization/moe_performance_comparison.png filter=lfs diff=lfs merge=lfs -text +# Image files +*.png filter=lfs diff=lfs merge=lfs -text +*.jpg filter=lfs diff=lfs merge=lfs -text +*.jpeg filter=lfs diff=lfs merge=lfs -text +*.gif filter=lfs diff=lfs merge=lfs -text +*.bmp filter=lfs diff=lfs merge=lfs -text +*.tiff filter=lfs diff=lfs merge=lfs -text +*.tif filter=lfs diff=lfs merge=lfs -text +*.webp filter=lfs diff=lfs merge=lfs -text +*.svg filter=lfs diff=lfs merge=lfs -text +*.ico filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/index.html b/.venv/index.html deleted file mode 100644 index f3cb6ee07e1711f67b6447c984a2998e866c84b7..0000000000000000000000000000000000000000 --- a/.venv/index.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - Directory Index - - - -

Index of /.venv

- - - \ No newline at end of file diff --git a/.venv/lib/index.html b/.venv/lib/index.html deleted file mode 100644 index ccdf4339e7d9656235b67279909e397bd4c1dd5e..0000000000000000000000000000000000000000 --- a/.venv/lib/index.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - Directory Index - - - -

Index of /.venv/lib

- - - \ No newline at end of file diff --git a/.venv/lib/python3.11/index.html b/.venv/lib/python3.11/index.html deleted file mode 100644 index 3af34a216f69970720906537015aa45ee791045c..0000000000000000000000000000000000000000 --- a/.venv/lib/python3.11/index.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - Directory Index - - - -

Index of /.venv/lib/python3.11

- - - \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/flask/index.html b/.venv/lib/python3.11/site-packages/flask/index.html deleted file mode 100644 index b7ce8a2a5d485d91433d50deb142d19e909e1adb..0000000000000000000000000000000000000000 --- a/.venv/lib/python3.11/site-packages/flask/index.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - Directory Index - - - -

Index of /.venv/lib/python3.11/site-packages/flask

- - - \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/flask/sansio/index.html b/.venv/lib/python3.11/site-packages/flask/sansio/index.html deleted file mode 100644 index 94bc5e014b7c53407b9c7c5f1848d268c3f3028f..0000000000000000000000000000000000000000 --- a/.venv/lib/python3.11/site-packages/flask/sansio/index.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - Directory Index - - - -

Index of /.venv/lib/python3.11/site-packages/flask/sansio

- - - \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/index.html b/.venv/lib/python3.11/site-packages/index.html deleted file mode 100644 index d4c40d702642ed2e26b172d36dac4831f914c234..0000000000000000000000000000000000000000 --- a/.venv/lib/python3.11/site-packages/index.html +++ /dev/null @@ -1,26 +0,0 @@ - - - - - Directory Index - - - -

Index of /.venv/lib/python3.11/site-packages

- - - \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/index.html b/.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/index.html deleted file mode 100644 index 7f69784006ffb575de6d1e146a4f7dbeca91a70d..0000000000000000000000000000000000000000 --- a/.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/index.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - Directory Index - - - -

Index of /.venv/lib/python3.11/site-packages/markdown-3.9.dist-info

- - - \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses/index.html b/.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses/index.html deleted file mode 100644 index ea09a5b2b7d7c6a483abc292a6092f3adb6d0391..0000000000000000000000000000000000000000 --- a/.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses/index.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - Directory Index - - - -

Index of /.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses

- - - \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/werkzeug/debug/index.html b/.venv/lib/python3.11/site-packages/werkzeug/debug/index.html deleted file mode 100644 index ac697e964e418006a7e0a563c4e536e12994224a..0000000000000000000000000000000000000000 --- a/.venv/lib/python3.11/site-packages/werkzeug/debug/index.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - Directory Index - - - -

Index of /.venv/lib/python3.11/site-packages/werkzeug/debug

- - - \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/werkzeug/debug/shared/index.html b/.venv/lib/python3.11/site-packages/werkzeug/debug/shared/index.html deleted file mode 100644 index 818ff7551c3a328860e310bc5e5ea4daf0bb9ae8..0000000000000000000000000000000000000000 --- a/.venv/lib/python3.11/site-packages/werkzeug/debug/shared/index.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - Directory Index - - - -

Index of /.venv/lib/python3.11/site-packages/werkzeug/debug/shared

- - - \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/werkzeug/index.html b/.venv/lib/python3.11/site-packages/werkzeug/index.html deleted file mode 100644 index 0deece299e75d2120c0b4c58b8f2d20ad8784c80..0000000000000000000000000000000000000000 --- a/.venv/lib/python3.11/site-packages/werkzeug/index.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - Directory Index - - - -

Index of /.venv/lib/python3.11/site-packages/werkzeug

- - - \ No newline at end of file diff --git a/artifacts/charts/benchmark_dashboard.png b/artifacts/charts/benchmark_dashboard.png deleted file mode 100644 index 3e41a250a9cf81deca81d3b8680906932190c3ce..0000000000000000000000000000000000000000 Binary files a/artifacts/charts/benchmark_dashboard.png and /dev/null differ diff --git a/artifacts/charts/latency.png b/artifacts/charts/latency.png deleted file mode 100644 index 3a47479e94c29b164c07640ac4e3b0839a09b136..0000000000000000000000000000000000000000 Binary files a/artifacts/charts/latency.png and /dev/null differ diff --git a/artifacts/charts/memory.png b/artifacts/charts/memory.png deleted file mode 100644 index 4f2455d5cf6c7ec026cfb2d3237bb00ade76bb5d..0000000000000000000000000000000000000000 Binary files a/artifacts/charts/memory.png and /dev/null differ diff --git a/artifacts/charts/throughput.png b/artifacts/charts/throughput.png deleted file mode 100644 index 03dca989b1ab57e1a4ba8471c4ef62579b743e0b..0000000000000000000000000000000000000000 Binary files a/artifacts/charts/throughput.png and /dev/null differ diff --git a/artifacts/setup/benchmark_avg_tokens_per_sec.txt b/artifacts/setup/benchmark_avg_tokens_per_sec.txt deleted file mode 100644 index af3f00403ff74eb6871c2c28a93d941c2ce0ae9d..0000000000000000000000000000000000000000 --- a/artifacts/setup/benchmark_avg_tokens_per_sec.txt +++ /dev/null @@ -1 +0,0 @@ -5.301658854167735 diff --git a/artifacts/setup/benchmark_dashboard.png b/artifacts/setup/benchmark_dashboard.png deleted file mode 100644 index e34083a615418e4c216b3c892b87a0bee7a77f64..0000000000000000000000000000000000000000 Binary files a/artifacts/setup/benchmark_dashboard.png and /dev/null differ diff --git a/artifacts/setup/benchmark_memory.txt b/artifacts/setup/benchmark_memory.txt deleted file mode 100644 index cafad0766947b5c3809e939b898b158a2f6520d8..0000000000000000000000000000000000000000 --- a/artifacts/setup/benchmark_memory.txt +++ /dev/null @@ -1 +0,0 @@ -9.398672896,9.414898176,10.334765056 diff --git a/artifacts/setup/benchmark_times.txt b/artifacts/setup/benchmark_times.txt deleted file mode 100644 index 01883f89fd7f6b87427c696c3bdce733de2ab011..0000000000000000000000000000000000000000 --- a/artifacts/setup/benchmark_times.txt +++ /dev/null @@ -1,5 +0,0 @@ -12.075035744113848 -12.0710428240709 -12.070115809096023 -12.070908240042627 -12.071364195086062 diff --git a/cells/charts.py b/cells/charts.py deleted file mode 100644 index 68834c18ee9a315ae893cb7cdca1980324a2e517..0000000000000000000000000000000000000000 --- a/cells/charts.py +++ /dev/null @@ -1,140 +0,0 @@ -# /// script -# dependencies = [ -# "matplotlib", -# "numpy", -# ] -# /// - -import matplotlib.pyplot as plt -import numpy as np -import os - -# get the pathf rom UVNOTE_SETUP env var -setup_path = os.getenv("UVNOTE_INPUT_SETUP", ".") -print(f"Reading benchmark data from: {setup_path}") - -num_runs = 5 -max_tokens = 64 -times = [] -with open(os.path.join(setup_path, "benchmark_times.txt"), "r") as f: - for line in f: - times.append(float(line.strip())) - - -avg_time = 0.0 -min_time = 0.0 -max_time = 0.0 -final_mem = {"allocated_gb": 0.0, "peak_gb": 0.0, "reserved_gb": 0.0} - -avg_tokens_per_sec = 0.0 -with open(os.path.join(setup_path, "benchmark_avg_tokens_per_sec.txt"), "r") as f: - avg_tokens_per_sec = float(f.read().strip()) - -times_file = os.path.join(setup_path, "benchmark_times.txt") -memory_file = os.path.join(setup_path, "benchmark_memory.txt") - - -# Minimal brutalist palette (dark theme): grayscale + 1 accent -ACCENT = '#5ec8f8' # 
calm cyan-blue accent -FG = '#e6e6e6' # light gray text/lines -MUTED = '#9aa0a6' # muted gray for secondary -GRID = '#333333' # grid lines - -# Styling tuned for clarity, high contrast, few colors -plt.style.use('dark_background') -plt.rcParams['figure.facecolor'] = 'none' -plt.rcParams['axes.facecolor'] = 'none' -plt.rcParams['savefig.facecolor'] = 'none' -plt.rcParams['savefig.transparent'] = True -plt.rcParams['font.family'] = 'monospace' -plt.rcParams['font.weight'] = 'bold' -plt.rcParams['axes.linewidth'] = 3 -plt.rcParams['grid.linewidth'] = 2 -plt.rcParams['lines.linewidth'] = 3 -plt.rcParams['patch.linewidth'] = 2 - -# Prepare data -runs = list(range(1, len(times) + 1)) -tokens_per_sec_all = [max_tokens / t for t in times] - -# Chart 1: Throughput Performance -fig1, ax1 = plt.subplots(1, 1, figsize=(12, 6)) -fig1.patch.set_alpha(0) -ax1.patch.set_alpha(0) - -ax1.plot(runs, tokens_per_sec_all, color=ACCENT, marker='o', markersize=12, - markerfacecolor=ACCENT, markeredgecolor=FG, markeredgewidth=3, linewidth=5, label='tok/s') -ax1.fill_between(runs, 0, tokens_per_sec_all, alpha=0.2, color=ACCENT) -ax1.axhline(y=avg_tokens_per_sec, color=FG, linestyle='--', linewidth=3, - label=f'AVG: {avg_tokens_per_sec:.1f}') -ax1.set_title('THROUGHPUT PERFORMANCE', color=FG, fontsize=18, pad=20, fontweight='bold') -ax1.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold') -ax1.set_ylabel('TOKENS/SEC', color=FG, fontsize=14, fontweight='bold') -ax1.grid(True, color=GRID, alpha=0.5, linewidth=2) -ax1.tick_params(colors=FG, labelsize=12) -legend1 = ax1.legend(frameon=False, loc='lower right') -for text in legend1.get_texts(): - text.set_color(FG) - text.set_fontweight('bold') -plt.tight_layout() -plt.savefig('throughput.png', dpi=150, bbox_inches='tight', transparent=True) -plt.show() - -# Chart 2: Generation Latency -fig2, ax2 = plt.subplots(1, 1, figsize=(12, 6)) -fig2.patch.set_alpha(0) -ax2.patch.set_alpha(0) - -bar_colors = [ACCENT if i % 2 == 0 else MUTED for i in range(len(times))] -bars = ax2.bar(runs, times, color=bar_colors, edgecolor=FG, linewidth=3, width=0.6) -ax2.axhline(y=avg_time, color=FG, linestyle='--', linewidth=3, - label=f'AVG: {avg_time:.2f}s') -for i, (run, time, bar) in enumerate(zip(runs, times, bars)): - ax2.text(run, time + 0.02, f'{time:.2f}s', ha='center', va='bottom', - color=FG, fontweight='bold', fontsize=11) -ax2.set_title('GENERATION LATENCY', color=FG, fontsize=18, pad=20, fontweight='bold') -ax2.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold') -ax2.set_ylabel('TIME (SECONDS)', color=FG, fontsize=14, fontweight='bold') -ax2.grid(True, axis='y', color=GRID, alpha=0.5, linewidth=2) -ax2.tick_params(colors=FG, labelsize=12) -ax2.set_ylim(0, max(times) * 1.15) -legend2 = ax2.legend(frameon=False, loc='upper right') -for text in legend2.get_texts(): - text.set_color(FG) - text.set_fontweight('bold') -plt.tight_layout() -plt.savefig('latency.png', dpi=150, bbox_inches='tight', transparent=True) -plt.show() - -# Chart 3: Memory Usage -fig3, ax3 = plt.subplots(1, 1, figsize=(12, 6)) -fig3.patch.set_alpha(0) -ax3.patch.set_alpha(0) - -memory_labels = ['ALLOCATED', 'PEAK', 'RESERVED'] -memory_values = [final_mem['allocated_gb'], final_mem['peak_gb'], final_mem['reserved_gb']] -colors_mem = [MUTED, ACCENT, FG] -bars = ax3.barh(memory_labels, memory_values, color=colors_mem, edgecolor=FG, linewidth=3, height=0.5) -for i, (label, value, bar) in enumerate(zip(memory_labels, memory_values, bars)): - ax3.text(value + 0.5, i, f'{value:.1f} GB', 
va='center', - color=FG, fontweight='bold', fontsize=13) -ax3.set_title('MEMORY USAGE', color=FG, fontsize=18, pad=20, fontweight='bold') -ax3.set_xlabel('GIGABYTES', color=FG, fontsize=14, fontweight='bold') -ax3.set_xlim(0, max(memory_values) * 1.3) -ax3.grid(True, axis='x', color=GRID, alpha=0.5, linewidth=2) -ax3.tick_params(colors=FG, labelsize=12) -ax3.set_yticks(range(len(memory_labels))) -ax3.set_yticklabels(memory_labels, fontweight='bold') -plt.tight_layout() -plt.savefig('memory.png', dpi=150, bbox_inches='tight', transparent=True) -plt.show() - -print(f"\nπŸ“Š Charts saved as:") -print(f" β€’ throughput.png") -print(f" β€’ latency.png") -print(f" β€’ memory.png") -print(f"\nBenchmark Summary:") -print(f" avg tokens/sec: {avg_tokens_per_sec:.1f}") -print(f" min time: {min_time:.3f}s") -print(f" max time: {max_time:.3f}s") -print(f" peak memory: {final_mem['peak_gb']:.2f}GB") diff --git a/cells/forward_and_backward.py b/cells/forward_and_backward.py deleted file mode 100644 index e9c24970def28b0b80af5848de10e41cc87a1349..0000000000000000000000000000000000000000 --- a/cells/forward_and_backward.py +++ /dev/null @@ -1,102 +0,0 @@ -# /// script -# requires-python = ">=3.12" -# dependencies = [ -# "accelerate>=1.10.1", -# "torch>=2.7.0", -# "kernels==0.10.0", -# "transformers@https://github.com/huggingface/transformers.git", -# "ipdb>=0.13.13", -# "matplotlib>=3.7.2", -# "numpy>=1.24.3", -# ] -# /// - -import torch -from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config -import time -import torch.nn as nn -from kernels import register_kernel_mapping, Mode, LayerRepository -import sys -import torch.profiler -import gc -import logging -from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm - -# remove liger kernel for testing -replace_kernel_forward_from_hub(GptOssRMSNorm, None) - -# set to debug logging -logging.basicConfig(level=logging.INFO) - -def reset_peak_memory_stats(): - """Clear CUDA cache and reset memory allocation counters.""" - torch.cuda.empty_cache() - if torch.cuda.is_available(): - torch.cuda.reset_peak_memory_stats() - gc.collect() - -def get_memory_stats(): - """Get current and peak CUDA memory usage.""" - if not torch.cuda.is_available(): - return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0} - return { - "allocated_gb": torch.cuda.memory_allocated() / 1e9, - "peak_gb": torch.cuda.max_memory_allocated() / 1e9, - "reserved_gb": torch.cuda.memory_reserved() / 1e9, - } - -def override_kernel_layer_name(cls_name: str, value) -> bool: - """Helper to dynamically override the kernel_layer_name in a model class.""" - for mod in sys.modules.values(): - if mod is None: - continue - obj = getattr(mod, cls_name, None) - if isinstance(obj, type) and issubclass(obj, nn.Module): - setattr(obj, "kernel_layer_name", value) - print(f"Overrode {cls_name}.kernel_layer_name to {value}") - return True - return False - - -# Init the model the normal way -model_id = "openai/gpt-oss-20b" -tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id) -quantization_config = Mxfp4Config(dequantize=True) - -model = GptOssForCausalLM.from_pretrained( - model_id, - dtype="bfloat16", - device_map="auto", - use_kernels=True, - quantization_config=quantization_config, - training=True, -).eval() - -messages = [ - {"role": "system", "content": "What is Tensor Parallelism?"}, -] - -inputs = tokenizer.apply_chat_template( - messages, - add_generation_prompt=True, - return_tensors="pt", - return_dict=True, - reasoning_effort="low", -).to("cuda") - 
-max_tokens = 512 - - -# forward and backward pass -with torch.autograd.set_grad_enabled(True): - start_time = time.perf_counter() - generated = model.generate( - **inputs, - max_new_tokens=max_tokens, - do_sample=False, - temperature=None, - ) - end_time = time.perf_counter() - print(tokenizer.decode(generated[0], skip_special_tokens=False)) - print(f"Generation took {end_time - start_time:.2f} seconds") - diff --git a/cells/forward_only.py b/cells/forward_only.py deleted file mode 100644 index 9a26615c3e9705cfdbc9be29bdcf68f9d24e597b..0000000000000000000000000000000000000000 --- a/cells/forward_only.py +++ /dev/null @@ -1,96 +0,0 @@ -# /// script -# requires-python = ">=3.12" -# dependencies = [ -# "accelerate>=1.10.1", -# "torch>=2.7.0", -# "kernels==0.10.0", -# "transformers@https://github.com/huggingface/transformers.git", -# "ipdb>=0.13.13", -# "matplotlib>=3.7.2", -# "numpy>=1.24.3", -# ] -# /// - -import torch -from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config -import time -import torch.nn as nn -from kernels import register_kernel_mapping, Mode, LayerRepository -import sys -import torch.profiler -import gc -import logging - -# set to debug logging -logging.basicConfig(level=logging.INFO) - -def reset_peak_memory_stats(): - """Clear CUDA cache and reset memory allocation counters.""" - torch.cuda.empty_cache() - if torch.cuda.is_available(): - torch.cuda.reset_peak_memory_stats() - gc.collect() - -def get_memory_stats(): - """Get current and peak CUDA memory usage.""" - if not torch.cuda.is_available(): - return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0} - return { - "allocated_gb": torch.cuda.memory_allocated() / 1e9, - "peak_gb": torch.cuda.max_memory_allocated() / 1e9, - "reserved_gb": torch.cuda.memory_reserved() / 1e9, - } - -def override_kernel_layer_name(cls_name: str, value) -> bool: - """Helper to dynamically override the kernel_layer_name in a model class.""" - for mod in sys.modules.values(): - if mod is None: - continue - obj = getattr(mod, cls_name, None) - if isinstance(obj, type) and issubclass(obj, nn.Module): - setattr(obj, "kernel_layer_name", value) - print(f"Overrode {cls_name}.kernel_layer_name to {value}") - return True - return False - - -# Init the model the normal way -model_id = "openai/gpt-oss-20b" -tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id) -quantization_config = Mxfp4Config(dequantize=True) - - -model = GptOssForCausalLM.from_pretrained( - model_id, - dtype="bfloat16", - device_map="auto", - use_kernels=True, - quantization_config=quantization_config, -).eval() - -messages = [ - {"role": "system", "content": "What is Tensor Parallelism?"}, -] - -inputs = tokenizer.apply_chat_template( - messages, - add_generation_prompt=True, - return_tensors="pt", - return_dict=True, - reasoning_effort="low", -).to("cuda") - -max_tokens = 512 - -with torch.inference_mode(): - start_time = time.perf_counter() - generated = model.generate( - **inputs, - max_new_tokens=max_tokens, - do_sample=False, - temperature=None, - ) - end_time = time.perf_counter() - -print(tokenizer.decode(generated[0], skip_special_tokens=False)) -print(f"Generation took {end_time - start_time:.2f} seconds") diff --git a/cells/nv.py b/cells/nv.py deleted file mode 100644 index 80eef60a7536ed875fb21731ab2d059458bd20b4..0000000000000000000000000000000000000000 --- a/cells/nv.py +++ /dev/null @@ -1,3 +0,0 @@ -import subprocess - -print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout) \ No newline at end of file diff 
--git a/cells/setup.py b/cells/setup.py deleted file mode 100644 index 1d44c93415f7502e5fb2ecec3a07ba74863d372e..0000000000000000000000000000000000000000 --- a/cells/setup.py +++ /dev/null @@ -1,116 +0,0 @@ -# /// script -# requires-python = ">=3.12" -# dependencies = [ -# "accelerate>=1.10.1", -# "torch>=2.7.0", -# "kernels==0.10.0", -# "transformers@https://github.com/huggingface/transformers.git", -# "ipdb>=0.13.13", -# "matplotlib>=3.7.2", -# "numpy>=1.24.3", -# ] -# /// - -import torch -from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config -import time -import torch.nn as nn -from kernels import register_kernel_mapping, Mode, LayerRepository -import sys -import torch.profiler -import gc -import logging - -# set to debug logging -logging.basicConfig(level=logging.INFO) - -def reset_peak_memory_stats(): - """Clear CUDA cache and reset memory allocation counters.""" - torch.cuda.empty_cache() - if torch.cuda.is_available(): - torch.cuda.reset_peak_memory_stats() - gc.collect() - -def get_memory_stats(): - """Get current and peak CUDA memory usage.""" - if not torch.cuda.is_available(): - return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0} - return { - "allocated_gb": torch.cuda.memory_allocated() / 1e9, - "peak_gb": torch.cuda.max_memory_allocated() / 1e9, - "reserved_gb": torch.cuda.memory_reserved() / 1e9, - } - -def override_kernel_layer_name(cls_name: str, value) -> bool: - """Helper to dynamically override the kernel_layer_name in a model class.""" - for mod in sys.modules.values(): - if mod is None: - continue - obj = getattr(mod, cls_name, None) - if isinstance(obj, type) and issubclass(obj, nn.Module): - setattr(obj, "kernel_layer_name", value) - print(f"Overrode {cls_name}.kernel_layer_name to {value}") - return True - return False - - -# Init the model the normal way -model_id = "openai/gpt-oss-20b" -tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id) -quantization_config = Mxfp4Config(dequantize=True) - - -from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode - -from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm - -replace_kernel_forward_from_hub(GptOssMLP, "Yamoe") # direct, type-safe -replace_kernel_forward_from_hub(GptOssRMSNorm, None) # direct, type-safe -custom_mapping = { - "Yamoe": { - "cuda": { - Mode.INFERENCE: LayerRepository( - repo_id="drbh/yamoe", - layer_name="Yamoe", - revision="v0.3.0", - ) - } - } -} -register_kernel_mapping(custom_mapping) - - -model = GptOssForCausalLM.from_pretrained( - model_id, - dtype="bfloat16", - device_map="auto", - use_kernels=True, - quantization_config=quantization_config, -).eval() - -messages = [ - {"role": "system", "content": "What is Tensor Parallelism?"}, -] - -inputs = tokenizer.apply_chat_template( - messages, - add_generation_prompt=True, - return_tensors="pt", - return_dict=True, - reasoning_effort="low", -).to("cuda") - -max_tokens = 512 - -with torch.inference_mode(): - start_time = time.perf_counter() - generated = model.generate( - **inputs, - max_new_tokens=max_tokens, - do_sample=False, - temperature=None, - ) - end_time = time.perf_counter() - -print(tokenizer.decode(generated[0], skip_special_tokens=False)) -print(f"Generation took {end_time - start_time:.2f} seconds") diff --git a/cells/setup2.py b/cells/setup2.py deleted file mode 100644 index 1e0c0cda49faa5c3bac38e61579c62e182be98df..0000000000000000000000000000000000000000 --- a/cells/setup2.py +++ /dev/null @@ -1,115 +0,0 @@ -# /// script 
-# requires-python = ">=3.12" -# dependencies = [ -# "accelerate>=1.10.1", -# "torch>=2.7.0", -# "kernels==0.10.0", -# "transformers@https://github.com/huggingface/transformers.git", -# "ipdb>=0.13.13", -# "matplotlib>=3.7.2", -# "numpy>=1.24.3", -# ] -# /// - -import torch -from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config -import time -import torch.nn as nn -from kernels import register_kernel_mapping, Mode, LayerRepository -import sys -import torch.profiler -import gc -import logging - -# set to debug logging -logging.basicConfig(level=logging.INFO) - -def reset_peak_memory_stats(): - """Clear CUDA cache and reset memory allocation counters.""" - torch.cuda.empty_cache() - if torch.cuda.is_available(): - torch.cuda.reset_peak_memory_stats() - gc.collect() - -def get_memory_stats(): - """Get current and peak CUDA memory usage.""" - if not torch.cuda.is_available(): - return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0} - return { - "allocated_gb": torch.cuda.memory_allocated() / 1e9, - "peak_gb": torch.cuda.max_memory_allocated() / 1e9, - "reserved_gb": torch.cuda.memory_reserved() / 1e9, - } - -def override_kernel_layer_name(cls_name: str, value) -> bool: - """Helper to dynamically override the kernel_layer_name in a model class.""" - for mod in sys.modules.values(): - if mod is None: - continue - obj = getattr(mod, cls_name, None) - if isinstance(obj, type) and issubclass(obj, nn.Module): - setattr(obj, "kernel_layer_name", value) - print(f"Overrode {cls_name}.kernel_layer_name to {value}") - return True - return False - - -# Init the model the normal way -model_id = "openai/gpt-oss-20b" -tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id) -quantization_config = Mxfp4Config(dequantize=True) - - -from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode - -from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm - -replace_kernel_forward_from_hub(GptOssRMSNorm, None) # direct, type-safe -custom_mapping = { - "Yamoe": { - "cuda": { - Mode.INFERENCE: LayerRepository( - repo_id="drbh/yamoe", - layer_name="Yamoe", - revision="v0.3.0", - ) - } - } -} -register_kernel_mapping(custom_mapping) - - -model = GptOssForCausalLM.from_pretrained( - model_id, - dtype="bfloat16", - device_map="auto", - use_kernels=True, - quantization_config=quantization_config, -).eval() - -messages = [ - {"role": "system", "content": "What is Tensor Parallelism?"}, -] - -inputs = tokenizer.apply_chat_template( - messages, - add_generation_prompt=True, - return_tensors="pt", - return_dict=True, - reasoning_effort="low", -).to("cuda") - -max_tokens = 512 - -with torch.inference_mode(): - start_time = time.perf_counter() - generated = model.generate( - **inputs, - max_new_tokens=max_tokens, - do_sample=False, - temperature=None, - ) - end_time = time.perf_counter() - -print(tokenizer.decode(generated[0], skip_special_tokens=False)) -print(f"Generation took {end_time - start_time:.2f} seconds") diff --git a/index.html b/index.html deleted file mode 100644 index 0c248cc83161f77a6f15eafdaf3758bde0650b7a..0000000000000000000000000000000000000000 --- a/index.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - Directory Index - - - -

Index of /

- - - \ No newline at end of file diff --git a/megablocks_only.html b/megablocks_only.html deleted file mode 100644 index 9626698e0d7527be74b531739756fd38544ed636..0000000000000000000000000000000000000000 --- a/megablocks_only.html +++ /dev/null @@ -1,4250 +0,0 @@ - - - - - - Megablocks Only Test - - - - - - - -
-
-Generated on: Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
-
-Reference kernel
-
-Cell: forward_only | 100.45s | FAILED
-
# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-#     "accelerate>=1.10.1",
-#     "torch>=2.7.0",
-#     "kernels==0.10.0",
-#     "transformers@https://github.com/huggingface/transformers.git",
-#     "ipdb>=0.13.13",
-#     "matplotlib>=3.7.2",
-#     "numpy>=1.24.3",
-# ]
-# ///
-
-import torch
-from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
-import time
-import torch.nn as nn
-from kernels import register_kernel_mapping, Mode, LayerRepository
-import sys
-import torch.profiler
-import gc
-import logging
-
-# set to debug logging
-logging.basicConfig(level=logging.INFO)
-
-def reset_peak_memory_stats():
-    """Clear CUDA cache and reset memory allocation counters."""
-    torch.cuda.empty_cache()
-    if torch.cuda.is_available():
-        torch.cuda.reset_peak_memory_stats()
-    gc.collect()
-
-def get_memory_stats():
-    """Get current and peak CUDA memory usage."""
-    if not torch.cuda.is_available():
-        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-    return {
-        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-    }
-
-def override_kernel_layer_name(cls_name: str, value) -> bool:
-    """Helper to dynamically override the kernel_layer_name in a model class."""
-    for mod in sys.modules.values():
-        if mod is None:
-            continue
-        obj = getattr(mod, cls_name, None)
-        if isinstance(obj, type) and issubclass(obj, nn.Module):
-            setattr(obj, "kernel_layer_name", value)
-            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-            return True
-    return False
-
-
-# Init the model the normal way
-model_id = "openai/gpt-oss-20b"
-tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
-quantization_config = Mxfp4Config(dequantize=True)
-
-
-model = GptOssForCausalLM.from_pretrained(
-    model_id,
-    dtype="bfloat16",
-    device_map="auto",
-    use_kernels=True,
-    quantization_config=quantization_config,
-).eval()
-
-messages = [
-    {"role": "system", "content": "What is Tensor Parallelism?"},
-]
-
-inputs = tokenizer.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    return_dict=True,
-    reasoning_effort="low",
-).to("cuda")
-
-max_tokens = 512
-
-with torch.inference_mode():
-    start_time = time.perf_counter()
-    generated = model.generate(
-        **inputs,
-        max_new_tokens=max_tokens,
-        do_sample=False,
-        temperature=None,
-    )
-    end_time = time.perf_counter()
-
-print(tokenizer.decode(generated[0], skip_special_tokens=False))
-print(f"Generation took {end_time - start_time:.2f} seconds")
-
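For reference, a minimal sketch (not part of the original cell) of how a tokens/sec figure like the one stored in artifacts/setup/benchmark_avg_tokens_per_sec.txt could be derived from the timing block above, assuming the `inputs`, `generated`, `start_time`, and `end_time` variables it defines:

# Hypothetical post-processing: count only the newly generated tokens and
# divide by the measured wall-clock time.
prompt_len = inputs["input_ids"].shape[1]
new_tokens = generated.shape[1] - prompt_len
elapsed = end_time - start_time
print(f"{new_tokens} new tokens in {elapsed:.2f}s -> {new_tokens / elapsed:.2f} tok/s")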
- -
-
-
-
-
-
-
UV Install Logs
- -
-
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s] -Fetching 3 files: 33%|β–ˆβ–ˆβ–ˆβ–Ž | 1/3 [00:13<00:27, 13.93s/it] -Fetching 3 files: 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 2/3 [00:17<00:08, 8.08s/it] -Fetching 3 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:17<00:00, 5.97s/it] -You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False - -Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s] -Loading checkpoint shards: 33%|β–ˆβ–ˆβ–ˆβ–Ž | 1/3 [00:03<00:06, 3.23s/it] -Loading checkpoint shards: 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 2/3 [00:06<00:03, 3.14s/it] -Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:08<00:00, 2.49s/it] -Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:08<00:00, 2.68s/it] -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` - -Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s] -Fetching 66 files: 2%|▏ | 1/66 [00:00<00:15, 4.28it/s] -Fetching 66 files: 26%|β–ˆβ–ˆβ–Œ | 17/66 [00:01<00:03, 12.73it/s] -Fetching 66 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 66/66 [00:01<00:00, 47.76it/s] -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` - -Fetching 17 files: 0%| | 0/17 [00:00<?, ?it/s] -Fetching 17 files: 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 11/17 [00:00<00:00, 104.99it/s] -Fetching 17 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17/17 [00:00<00:00, 128.06it/s] -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for 
layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` 
-INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` 
from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo 
`kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` 
(revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` -Traceback (most recent call last): - File "/home/ubuntu/Projects/yamoe-gpt-integration/.uvnote/cells/forward_only.py", line 87, in <module> - generated = model.generate( - ^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context - return func(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/generation/utils.py", line 2546, in generate - result = decoding_method( - ^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/generation/utils.py", line 2766, in _sample - outputs = self(**model_inputs, return_dict=True) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl - return forward_call(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/accelerate/hooks.py", line 175, in new_forward - output = 
module._old_forward(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/utils/generic.py", line 783, in wrapper - output = func(self, *args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/models/gpt_oss/modeling_gpt_oss.py", line 668, in forward - outputs: MoeModelOutputWithPast = self.model( - ^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl - return forward_call(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/utils/generic.py", line 929, in wrapper - outputs = func(self, *args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/models/gpt_oss/modeling_gpt_oss.py", line 507, in forward - hidden_states = decoder_layer( - ^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ - return super().__call__(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl - return forward_call(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/accelerate/hooks.py", line 175, in new_forward - output = module._old_forward(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func - return func(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/models/gpt_oss/modeling_gpt_oss.py", line 369, in forward - hidden_states = self.input_layernorm(hidden_states) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in 
_call_impl - return forward_call(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/huggingface/hub/models--kernels-community--liger_kernels/snapshots/7435d25a8faf175758be14046371a5b0c686f94c/build/torch-universal/liger_kernels/layers.py", line 30, in forward - return LigerRMSNormFunction.apply( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/autograd/function.py", line 576, in apply - return super().apply(*args, **kwargs) # type: ignore[misc] - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/huggingface/hub/models--kernels-community--liger_kernels/snapshots/7435d25a8faf175758be14046371a5b0c686f94c/build/torch-universal/liger_kernels/utils.py", line 48, in wrapper - return fn(ctx, *args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/huggingface/hub/models--kernels-community--liger_kernels/snapshots/7435d25a8faf175758be14046371a5b0c686f94c/build/torch-universal/liger_kernels/rms_norm.py", line 338, in forward - Y, X, RSTD, BLOCK_SIZE, num_warps, casting_mode = rms_norm_forward(X, W, eps, offset, casting_mode) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/huggingface/hub/models--kernels-community--liger_kernels/snapshots/7435d25a8faf175758be14046371a5b0c686f94c/build/torch-universal/liger_kernels/rms_norm.py", line 230, in rms_norm_forward - _rms_norm_forward_kernel[(n_rows,)]( - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/triton/runtime/jit.py", line 390, in <lambda> - return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/triton/runtime/jit.py", line 617, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata, - File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 708, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl, -ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
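The ValueError above indicates the LigerRMSNorm Triton kernel received a CPU tensor. A minimal, hypothetical sketch (standard PyTorch APIs only, not part of the original cells) of a forward pre-hook that fails fast with a clearer message when a kernel-backed layer is about to run on non-CUDA inputs:

import torch
import torch.nn as nn

def add_cuda_input_check(module: nn.Module, name: str = "") -> None:
    # Hypothetical helper: raise a descriptive error before a Triton-backed
    # layer (e.g. an RMSNorm replacement) is called with CPU tensors.
    def _check(mod, args):
        for i, a in enumerate(args):
            if torch.is_tensor(a) and not a.is_cuda:
                raise RuntimeError(
                    f"{name or mod.__class__.__name__}: input {i} is on {a.device}, expected a CUDA tensor"
                )
    module.register_forward_pre_hook(_check)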
-
-
- -

Backwards

-
- - - \ No newline at end of file diff --git a/note.html b/note.html deleted file mode 100644 index 110dc576a7fde9f2b9e6959b1d07117e007578db..0000000000000000000000000000000000000000 --- a/note.html +++ /dev/null @@ -1,3733 +0,0 @@ - - - - - - uvnote Integration Test Report - - - - - -
-
- -
- -
-
Generated on:
-
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 -
-
- -
-
-
-Cell: setup | 304.89s
-
-
# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-#     "accelerate>=1.10.1",
-#     "torch>=2.7.0",
-#     "kernels==0.10.0",
-#     "transformers@https://github.com/huggingface/transformers.git",
-#     "ipdb>=0.13.13",
-#     "matplotlib>=3.7.2",
-#     "numpy>=1.24.3",
-# ]
-# ///
-
-import torch
-from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
-import time
-import torch.nn as nn
-from kernels import register_kernel_mapping, Mode, LayerRepository
-import sys
-import torch.profiler
-import gc
-
-def reset_peak_memory_stats():
-    """Clear CUDA cache and reset memory allocation counters."""
-    torch.cuda.empty_cache()
-    if torch.cuda.is_available():
-        torch.cuda.reset_peak_memory_stats()
-    gc.collect()
-
-def get_memory_stats():
-    """Get current and peak CUDA memory usage."""
-    if not torch.cuda.is_available():
-        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-    return {
-        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-    }
-
-def override_kernel_layer_name(cls_name: str, value) -> bool:
-    """Helper to dynamically override the kernel_layer_name in a model class."""
-    for mod in sys.modules.values():
-        if mod is None:
-            continue
-        obj = getattr(mod, cls_name, None)
-        if isinstance(obj, type) and issubclass(obj, nn.Module):
-            setattr(obj, "kernel_layer_name", value)
-            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-            return True
-    return False
-
-def run_generation(model, inputs, max_tokens=64):
-    """Run a single generation pass and measure its duration."""
-    with torch.inference_mode():
-        start_time = time.perf_counter()
-        generated = model.generate(
-            **inputs,
-            max_new_tokens=max_tokens,
-            do_sample=False,
-            temperature=None,
-        )
-        end_time = time.perf_counter()
-    return generated, end_time - start_time
-
-
-# Init the model the normal way
-model_id = "openai/gpt-oss-20b"
-tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
-quantization_config = Mxfp4Config(dequantize=True)
-
-# Now we want to add some custom kernel mapping
-custom_mapping = dict(
-    Yamoe=dict(
-        cuda={
-            Mode.INFERENCE: LayerRepository(
-                repo_id="drbh/yamoe",
-                layer_name="Yamoe",
-                revision="v0.3.0",
-            ),
-        },
-    )
-)
-# First add the mapping
-register_kernel_mapping(custom_mapping)
-# Then override the layer name in the model class
-override_kernel_layer_name("GptOssMLP", "Yamoe")
-
-# TODO: remove this line once RMSNorm is working
-override_kernel_layer_name("GptOssRMSNorm", None)
-
-## Normal model stuff
-
-model = GptOssForCausalLM.from_pretrained(
-    model_id,
-    dtype="bfloat16",
-    device_map="auto",
-    use_kernels=True,
-    quantization_config=quantization_config,
-).eval()
-
-
-messages = [
-    {"role": "system", "content": "What is Tensor Parallelism?"},
-]
-
-inputs = tokenizer.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    return_dict=True,
-    reasoning_effort="low",
-).to("cuda")
-
-
-
-print("\n=== Running Benchmarks ===")
-print(f"Model: {model_id}")
-print(f"Device: {torch.cuda.get_device_name()}")
-print(f"Initial memory: {get_memory_stats()}\n")
-
-# Warmup
-print("Running warmup...")
-for _ in range(2):
-    _ = run_generation(model, inputs, max_tokens=16)
-
-reset_peak_memory_stats()
-
-# Benchmark runs
-num_runs = 5
-max_tokens = 64
-times = []
-
-print(f"\nRunning {num_runs} benchmark iterations with {max_tokens} tokens...")
-for i in range(num_runs):
-    reset_peak_memory_stats()
-    generated, elapsed = run_generation(model, inputs, max_tokens)
-    times.append(elapsed)
-    mem_stats = get_memory_stats()
-    tokens_per_sec = max_tokens / elapsed
-    print(f"Run {i+1}: {elapsed:.3f}s ({tokens_per_sec:.1f} tok/s) | Peak: {mem_stats['peak_gb']:.2f}GB")
-
-# Statistics
-avg_time = sum(times) / len(times)
-min_time = min(times)
-max_time = max(times)
-avg_tokens_per_sec = max_tokens / avg_time
-
-print(f"\n=== Benchmark Results ===")
-print(f"Average: {avg_time:.3f}s ({avg_tokens_per_sec:.1f} tok/s)")
-print(f"Min: {min_time:.3f}s | Max: {max_time:.3f}s")
-
-# Final memory stats
-final_mem = get_memory_stats()
-print(f"\nFinal Memory:")
-print(f"  Allocated: {final_mem['allocated_gb']:.2f}GB")
-print(f"  Peak: {final_mem['peak_gb']:.2f}GB")
-print(f"  Reserved: {final_mem['reserved_gb']:.2f}GB")
-
-
-print("\n=== Running with Profiler ===")
-reset_peak_memory_stats()
-
-with torch.profiler.profile(
-    activities=[
-        torch.profiler.ProfilerActivity.CPU,
-        torch.profiler.ProfilerActivity.CUDA,
-    ],
-    record_shapes=True,
-    profile_memory=True,
-    with_stack=True,
-) as prof:
-    generated, elapsed = run_generation(model, inputs, max_tokens=64)
-
-print(f"Generation time: {elapsed:.2f} seconds")
-
-# Print profiler results
-print("\n=== Top 10 CUDA operations by time ===")
-print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
-
-print("\n=== Top 10 operations by memory ===")
-print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))
-
-# Memory stats
-mem_stats = get_memory_stats()
-print(f"\nPeak Memory: {mem_stats['peak_gb']:.2f}GB")
-
-# Save trace if needed
-prof.export_chrome_trace("trace.json")
-print("\nProfile trace saved to trace.json")
-
-
-# Decode and print output
-print("\nGenerated text:")
-print(tokenizer.decode(generated[0][inputs["input_ids"].shape[-1] :]))
-
-
-# save times and memory stats for charting
-with open("benchmark_times.txt", "w") as f:
-    for t in times:
-        f.write(f"{t}\n")
-with open("benchmark_memory.txt", "w") as f:
-    f.write(f"{final_mem['allocated_gb']},{final_mem['peak_gb']},{final_mem['reserved_gb']}\n")
-
-# save avg_tokens_per_sec for charting
-with open("benchmark_avg_tokens_per_sec.txt", "w") as f:
-    f.write(f"{avg_tokens_per_sec}\n")
-
-
- -
-
-
Overrode GptOssMLP.kernel_layer_name to Yamoe -Overrode GptOssRMSNorm.kernel_layer_name to None - -=== Running Benchmarks === -Model: openai/gpt-oss-20b -Device: NVIDIA L4 -Initial memory: {'allocated_gb': 9.390148608, 'peak_gb': 15.5643264, 'reserved_gb': 17.177772032} - -Running warmup... - -Running 5 benchmark iterations with 64 tokens... -Run 1: 12.075s (5.3 tok/s) | Peak: 9.41GB -Run 2: 12.071s (5.3 tok/s) | Peak: 9.41GB -Run 3: 12.070s (5.3 tok/s) | Peak: 9.41GB -Run 4: 12.071s (5.3 tok/s) | Peak: 9.41GB -Run 5: 12.071s (5.3 tok/s) | Peak: 9.41GB - -=== Benchmark Results === -Average: 12.072s (5.3 tok/s) -Min: 12.070s | Max: 12.075s - -Final Memory: - Allocated: 9.40GB - Peak: 9.41GB - Reserved: 10.33GB - -=== Running with Profiler === -Generation time: 12.73 seconds - -=== Top 10 CUDA operations by time === -------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg CPU Mem Self CPU Mem CUDA Mem Self CUDA Mem # of Calls -------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - _yamoe_74a2acb_dirty::experts 1.40% 148.156ms 66.87% 7.074s 4.606ms 52.388ms 0.46% 10.583s 6.890ms 0 B -2.98 KB 18.88 MB -2.11 GB 1536 - aten::bmm 1.25% 132.560ms 1.75% 185.015ms 29.803us 10.486s 91.79% 10.486s 1.689ms 0 B 0 B 63.12 MB 63.12 MB 6208 -void cutlass::Kernel2<cutlass_80_wmma_tensorop_bf16_... 0.00% 0.000us 0.00% 0.000us 0.000us 10.319s 90.32% 10.319s 3.412ms 0 B 0 B 0 B 0 B 3024 - aten::linear 0.54% 57.566ms 3.78% 399.802ms 51.627us 0.000us 0.00% 645.165ms 83.312us 0 B 0 B 76.88 MB 0 B 7744 - aten::addmm 1.81% 191.354ms 2.57% 272.095ms 35.429us 352.039ms 3.08% 352.151ms 45.853us 0 B 0 B 52.31 MB 52.31 MB 7680 -std::enable_if<!(false), void>::type internal::gemvx... 0.00% 0.000us 0.00% 0.000us 0.000us 344.917ms 3.02% 344.917ms 74.982us 0 B 0 B 0 B 0 B 4600 - aten::matmul 0.31% 32.441ms 1.72% 181.712ms 56.785us 0.000us 0.00% 303.821ms 94.944us 0 B 0 B 87.68 MB 0 B 3200 -std::enable_if<!(false), void>::type internal::gemvx... 
0.00% 0.000us 0.00% 0.000us 0.000us 293.850ms 2.57% 293.850ms 97.173us 0 B 0 B 0 B 0 B 3024 - aten::mm 0.01% 1.506ms 0.02% 2.161ms 33.768us 293.014ms 2.56% 293.014ms 4.578ms 0 B 0 B 24.56 MB 24.56 MB 64 - ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn 0.00% 0.000us 0.00% 0.000us 0.000us 102.278ms 0.90% 102.278ms 4.262ms 0 B 0 B 0 B 0 B 24 -------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 10.579s -Self CUDA time total: 11.424s - - -=== Top 10 operations by memory === -------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg CPU Mem Self CPU Mem CUDA Mem Self CUDA Mem # of Calls -------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - aten::empty 0.68% 72.026ms 0.68% 72.026ms 4.244us 0.000us 0.00% 0.000us 0.000us 296 B 296 B 3.49 GB 3.49 GB 16973 - aten::clamp 0.46% 48.185ms 0.69% 72.630ms 15.762us 10.269ms 0.09% 10.269ms 2.229us 0 B 0 B 616.69 MB 616.69 MB 4608 - aten::mul 1.76% 186.048ms 2.93% 310.383ms 14.181us 47.780ms 0.42% 47.792ms 2.184us 784 B 784 B 554.93 MB 554.93 MB 21888 - aten::cat 0.78% 82.030ms 1.22% 129.113ms 16.536us 17.028ms 0.15% 17.030ms 2.181us 0 B 0 B 387.88 MB 387.88 MB 7808 - aten::sigmoid 0.09% 9.855ms 0.16% 16.652ms 10.841us 2.889ms 0.03% 2.889ms 1.881us 0 B 0 B 307.97 MB 307.97 MB 1536 - aten::empty_strided 1.08% 114.498ms 1.10% 116.720ms 5.564us 0.000us 0.00% 0.000us 0.000us 0 B 0 B 216.60 MB 216.60 MB 20979 - aten::add 0.93% 97.861ms 1.56% 164.673ms 15.047us 16.394ms 0.14% 16.395ms 1.498us 0 B 0 B 91.03 MB 91.03 MB 10944 - aten::pow 0.36% 38.271ms 0.55% 58.020ms 18.501us 4.117ms 0.04% 4.117ms 1.313us 0 B 0 B 75.58 MB 75.58 MB 3136 - aten::bmm 1.25% 132.560ms 1.75% 185.015ms 29.803us 10.486s 91.79% 10.486s 1.689ms 0 B 0 B 63.12 MB 63.12 MB 6208 - aten::sub 0.51% 53.869ms 0.82% 87.218ms 13.626us 9.277ms 0.08% 9.355ms 1.461us 0 B 0 B 53.04 MB 53.01 MB 6401 -------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 10.579s -Self CUDA time total: 11.424s - - -Peak Memory: 9.41GB - -Profile trace saved to trace.json - -Generated text: -<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices, so each device holds a slice of the matrix. During forward/backward passes, each device computes partial results and then they are -
-
-
β–Ά UV Install Logs
- -
-
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s] -Fetching 3 files: 33%|β–ˆβ–ˆβ–ˆβ–Ž | 1/3 [00:11<00:23, 11.59s/it] -Fetching 3 files: 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 2/3 [00:16<00:07, 7.73s/it] -Fetching 3 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:16<00:00, 5.54s/it] -You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False - -Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s] -Loading checkpoint shards: 33%|β–ˆβ–ˆβ–ˆβ–Ž | 1/3 [00:03<00:06, 3.23s/it] -Loading checkpoint shards: 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 2/3 [00:06<00:03, 3.15s/it] -Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:08<00:00, 2.50s/it] -Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:08<00:00, 2.68s/it] - -Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s] -Fetching 6 files: 17%|β–ˆβ–‹ | 1/6 [00:00<00:00, 5.23it/s] -Fetching 6 files: 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3/6 [00:00<00:00, 6.19it/s] -Fetching 6 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 6/6 [00:00<00:00, 12.15it/s] -/tmp/uvnote-run-hjgpkuq6/home/.cache/uv/environments-v2/setup-30bb029f3f83f37d/lib/python3.12/site-packages/kernels/layer.py:868: UserWarning: -No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation. - warnings.warn( -/tmp/uvnote-run-hjgpkuq6/home/.cache/uv/environments-v2/setup-30bb029f3f83f37d/lib/python3.12/site-packages/kernels/layer.py:868: UserWarning: -No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation. - warnings.warn(
- -
-
- -
-
-Cell: charts | deps: matplotlib, numpy | 3.51s
-
-
import matplotlib.pyplot as plt
-import numpy as np
-import os
-
-# get the path from UVNOTE_SETUP env var
-setup_path = os.getenv("UVNOTE_INPUT_SETUP", ".")
-print(f"Reading benchmark data from: {setup_path}")
-
-num_runs = 5
-max_tokens = 64
-times = []
-with open(os.path.join(setup_path, "benchmark_times.txt"), "r") as f:
-    for line in f:
-        times.append(float(line.strip()))
-
-
-# Derive latency statistics from the measured times
-avg_time = sum(times) / len(times)
-min_time = min(times)
-max_time = max(times)
-
-# Load the memory stats saved by the setup cell: allocated, peak, reserved (GB)
-memory_file = os.path.join(setup_path, "benchmark_memory.txt")
-with open(memory_file, "r") as f:
-    allocated_gb, peak_gb, reserved_gb = (float(v) for v in f.read().strip().split(","))
-final_mem = {"allocated_gb": allocated_gb, "peak_gb": peak_gb, "reserved_gb": reserved_gb}
-
-with open(os.path.join(setup_path, "benchmark_avg_tokens_per_sec.txt"), "r") as f:
-    avg_tokens_per_sec = float(f.read().strip())
-
-
-# Minimal brutalist palette (dark theme): grayscale + 1 accent
-ACCENT = '#5ec8f8'   # calm cyan-blue accent
-FG = '#e6e6e6'       # light gray text/lines
-MUTED = '#9aa0a6'    # muted gray for secondary
-GRID = '#333333'     # grid lines
-
-# Styling tuned for clarity, high contrast, few colors
-plt.style.use('dark_background')
-plt.rcParams['figure.facecolor'] = 'none'
-plt.rcParams['axes.facecolor'] = 'none'
-plt.rcParams['savefig.facecolor'] = 'none'
-plt.rcParams['savefig.transparent'] = True
-plt.rcParams['font.family'] = 'monospace'
-plt.rcParams['font.weight'] = 'bold'
-plt.rcParams['axes.linewidth'] = 3
-plt.rcParams['grid.linewidth'] = 2
-plt.rcParams['lines.linewidth'] = 3
-plt.rcParams['patch.linewidth'] = 2
-
-# Prepare data
-runs = list(range(1, len(times) + 1))
-tokens_per_sec_all = [max_tokens / t for t in times]
-
-# Chart 1: Throughput Performance
-fig1, ax1 = plt.subplots(1, 1, figsize=(12, 6))
-fig1.patch.set_alpha(0)
-ax1.patch.set_alpha(0)
-
-ax1.plot(runs, tokens_per_sec_all, color=ACCENT, marker='o', markersize=12,
-         markerfacecolor=ACCENT, markeredgecolor=FG, markeredgewidth=3, linewidth=5, label='tok/s')
-ax1.fill_between(runs, 0, tokens_per_sec_all, alpha=0.2, color=ACCENT)
-ax1.axhline(y=avg_tokens_per_sec, color=FG, linestyle='--', linewidth=3,
-            label=f'AVG: {avg_tokens_per_sec:.1f}')
-ax1.set_title('THROUGHPUT PERFORMANCE', color=FG, fontsize=18, pad=20, fontweight='bold')
-ax1.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
-ax1.set_ylabel('TOKENS/SEC', color=FG, fontsize=14, fontweight='bold')
-ax1.grid(True, color=GRID, alpha=0.5, linewidth=2)
-ax1.tick_params(colors=FG, labelsize=12)
-legend1 = ax1.legend(frameon=False, loc='lower right')
-for text in legend1.get_texts():
-    text.set_color(FG)
-    text.set_fontweight('bold')
-plt.tight_layout()
-plt.savefig('throughput.png', dpi=150, bbox_inches='tight', transparent=True)
-plt.show()
-
-# Chart 2: Generation Latency
-fig2, ax2 = plt.subplots(1, 1, figsize=(12, 6))
-fig2.patch.set_alpha(0)
-ax2.patch.set_alpha(0)
-
-bar_colors = [ACCENT if i % 2 == 0 else MUTED for i in range(len(times))]
-bars = ax2.bar(runs, times, color=bar_colors, edgecolor=FG, linewidth=3, width=0.6)
-ax2.axhline(y=avg_time, color=FG, linestyle='--', linewidth=3,
-            label=f'AVG: {avg_time:.2f}s')
-for i, (run, time, bar) in enumerate(zip(runs, times, bars)):
-    ax2.text(run, time + 0.02, f'{time:.2f}s', ha='center', va='bottom',
-             color=FG, fontweight='bold', fontsize=11)
-ax2.set_title('GENERATION LATENCY', color=FG, fontsize=18, pad=20, fontweight='bold')
-ax2.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
-ax2.set_ylabel('TIME (SECONDS)', color=FG, fontsize=14, fontweight='bold')
-ax2.grid(True, axis='y', color=GRID, alpha=0.5, linewidth=2)
-ax2.tick_params(colors=FG, labelsize=12)
-ax2.set_ylim(0, max(times) * 1.15)
-legend2 = ax2.legend(frameon=False, loc='upper right')
-for text in legend2.get_texts():
-    text.set_color(FG)
-    text.set_fontweight('bold')
-plt.tight_layout()
-plt.savefig('latency.png', dpi=150, bbox_inches='tight', transparent=True)
-plt.show()
-
-# Chart 3: Memory Usage
-fig3, ax3 = plt.subplots(1, 1, figsize=(12, 6))
-fig3.patch.set_alpha(0)
-ax3.patch.set_alpha(0)
-
-memory_labels = ['ALLOCATED', 'PEAK', 'RESERVED']
-memory_values = [final_mem['allocated_gb'], final_mem['peak_gb'], final_mem['reserved_gb']]
-colors_mem = [MUTED, ACCENT, FG]
-bars = ax3.barh(memory_labels, memory_values, color=colors_mem, edgecolor=FG, linewidth=3, height=0.5)
-for i, (label, value, bar) in enumerate(zip(memory_labels, memory_values, bars)):
-    ax3.text(value + 0.5, i, f'{value:.1f} GB', va='center',
-             color=FG, fontweight='bold', fontsize=13)
-ax3.set_title('MEMORY USAGE', color=FG, fontsize=18, pad=20, fontweight='bold')
-ax3.set_xlabel('GIGABYTES', color=FG, fontsize=14, fontweight='bold')
-ax3.set_xlim(0, max(memory_values) * 1.3)
-ax3.grid(True, axis='x', color=GRID, alpha=0.5, linewidth=2)
-ax3.tick_params(colors=FG, labelsize=12)
-ax3.set_yticks(range(len(memory_labels)))
-ax3.set_yticklabels(memory_labels, fontweight='bold')
-plt.tight_layout()
-plt.savefig('memory.png', dpi=150, bbox_inches='tight', transparent=True)
-plt.show()
-
-print(f"\nπŸ“Š Charts saved as:")
-print(f"  β€’ throughput.png")
-print(f"  β€’ latency.png")
-print(f"  β€’ memory.png")
-print(f"\nBenchmark Summary:")
-print(f"  avg tokens/sec: {avg_tokens_per_sec:.1f}")
-print(f"  min time: {min_time:.3f}s")
-print(f"  max time: {max_time:.3f}s")
-print(f"  peak memory: {final_mem['peak_gb']:.2f}GB")
-
- -
-
-
Reading benchmark data from: /home/ubuntu/Projects/yamoe-gpt-integration/.uvnote/cache/0e89c413a25ded7b4d6fab2a010f0538ba2b35fb5f619a0dfced3121d3ccf879 - -πŸ“Š Charts saved as: - β€’ throughput.png - β€’ latency.png - β€’ memory.png - -Benchmark Summary: - avg tokens/sec: 5.3 - min time: 0.000s - max time: 0.000s - peak memory: 0.00GB -
-
-
β–Ά UV Install Logs
- -
-
/home/ubuntu/Projects/yamoe-gpt-integration/.uvnote/cells/charts.py:123: UserWarning: Attempting to set identical low and high xlims makes transformation singular; automatically expanding. - ax3.set_xlim(0, max(memory_values) * 1.3) -/home/ubuntu/Projects/yamoe-gpt-integration/.uvnote/cells/charts.py:128: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all Axes decorations. - plt.tight_layout()
-
-

Artifacts:

-latency.png -memory.png -throughput.png -
-latency.png -
-
-memory.png -
-
-throughput.png -
-
-
-
-
- - - - \ No newline at end of file diff --git a/note_test_override.html b/note_test_override.html deleted file mode 100644 index 23c5a059d2fb98596e33a750a34dfa0b2253f0e8..0000000000000000000000000000000000000000 --- a/note_test_override.html +++ /dev/null @@ -1,4593 +0,0 @@ - - - - - - uvnote Integration Test Report - - - - - - - -
-
-
- -
-
- -
-
Generated on:
-
- Linux x86_64 | Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36 -
-
- -
-
-
-Cell: nv | 0.53s
-
-
-
-
-
import subprocess
-
-print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
-
- -
-
-
-
-
-
Tue Sep 23 19:46:07 2025 -+-----------------------------------------------------------------------------------------+ -| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 | -|-----------------------------------------+------------------------+----------------------+ -| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | -| | | MIG M. | -|=========================================+========================+======================| -| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 | -| 0% 42C P0 71W / 300W | 0MiB / 23028MiB | 0% Default | -| | | N/A | -+-----------------------------------------+------------------------+----------------------+ -| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 | -| 0% 43C P0 44W / 300W | 0MiB / 23028MiB | 0% Default | -| | | N/A | -+-----------------------------------------+------------------------+----------------------+ -| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 | -| 0% 42C P0 46W / 300W | 0MiB / 23028MiB | 0% Default | -| | | N/A | -+-----------------------------------------+------------------------+----------------------+ -| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 | -| 0% 41C P0 43W / 300W | 0MiB / 23028MiB | 0% Default | -| | | N/A | -+-----------------------------------------+------------------------+----------------------+ - -+-----------------------------------------------------------------------------------------+ -| Processes: | -| GPU GI CI PID Type Process name GPU Memory | -| ID ID Usage | -|=========================================================================================| -| No running processes found | -+-----------------------------------------------------------------------------------------+ - -
-
-
- -
-
-Cell: setup | 133.12s
-
-
-
-
-
# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-#     "accelerate>=1.10.1",
-#     "torch>=2.7.0",
-#     "kernels==0.10.0",
-#     "transformers@https://github.com/huggingface/transformers.git",
-#     "ipdb>=0.13.13",
-#     "matplotlib>=3.7.2",
-#     "numpy>=1.24.3",
-# ]
-# ///
-
-import torch
-from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
-import time
-import torch.nn as nn
-from kernels import register_kernel_mapping, Mode, LayerRepository
-import sys
-import torch.profiler
-import gc
-import logging
-
-# enable INFO-level logging
-logging.basicConfig(level=logging.INFO)
-
-def reset_peak_memory_stats():
-    """Clear CUDA cache and reset memory allocation counters."""
-    torch.cuda.empty_cache()
-    if torch.cuda.is_available():
-        torch.cuda.reset_peak_memory_stats()
-    gc.collect()
-
-def get_memory_stats():
-    """Get current and peak CUDA memory usage."""
-    if not torch.cuda.is_available():
-        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-    return {
-        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-    }
-
-def override_kernel_layer_name(cls_name: str, value) -> bool:
-    """Helper to dynamically override the kernel_layer_name in a model class."""
-    for mod in sys.modules.values():
-        if mod is None:
-            continue
-        obj = getattr(mod, cls_name, None)
-        if isinstance(obj, type) and issubclass(obj, nn.Module):
-            setattr(obj, "kernel_layer_name", value)
-            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-            return True
-    return False
-
-
-# Init the model the normal way
-model_id = "openai/gpt-oss-20b"
-tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
-quantization_config = Mxfp4Config(dequantize=True)
-
-
-from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
-
-from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
-
-replace_kernel_forward_from_hub(GptOssMLP, "Yamoe")  # direct, type-safe
-replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
-custom_mapping = {
-    "Yamoe": {
-        "cuda": {
-            Mode.INFERENCE: LayerRepository(
-                repo_id="drbh/yamoe",
-                layer_name="Yamoe",
-                revision="v0.3.0",
-            )
-        }
-    }
-}
-register_kernel_mapping(custom_mapping)
-
-
-model = GptOssForCausalLM.from_pretrained(
-    model_id,
-    dtype="bfloat16",
-    device_map="auto",
-    use_kernels=True,
-    quantization_config=quantization_config,
-).eval()
-
-messages = [
-    {"role": "system", "content": "What is Tensor Parallelism?"},
-]
-
-inputs = tokenizer.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    return_dict=True,
-    reasoning_effort="low",
-).to("cuda")
-
-max_tokens = 512
-
-with torch.inference_mode():
-    start_time = time.perf_counter()
-    generated = model.generate(
-        **inputs,
-        max_new_tokens=max_tokens,
-        do_sample=False,
-        temperature=None,
-    )
-    end_time = time.perf_counter()
-
-print(tokenizer.decode(generated[0], skip_special_tokens=False))
-print(f"Generation took {end_time - start_time:.2f} seconds")
-
- -
-
-
-
-
-
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI. -Knowledge cutoff: 2024-06 -Current date: 2025-09-23 - -Reasoning: low - -# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions - -What is Tensor Parallelism? - -<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices, so each device holds a slice of the matrix. During forward/backward passes, each device computes partial results and then they are aggregated. It's used to scale up models beyond single device memory. Also mention pipeline parallelism, data parallelism. Provide details: e.g., for a linear layer weight matrix W of shape (out_features, in_features), we can split along out_features dimension across devices. Each device computes its part of the output. Then gather results. In backward, gradients are computed locally and then aggregated. Provide example: GPT-3 training uses tensor parallelism. Also mention frameworks: Megatron-LM, DeepSpeed, etc. Provide pros/cons. Provide code snippet maybe. Also mention that it's different from data parallelism. Provide explanation of how it works in practice. Provide mention of communication overhead. Provide mention of "tensor model parallelism" vs "tensor parallelism" synonyms. Provide mention of "tensor parallelism" in context of huggingface accelerate. Provide mention of "tensor parallelism" in context of DeepSpeed ZeRO stage 3. Provide mention of "tensor parallelism" in context of Megatron-LM. Provide mention of "tensor parallelism" in context of GPT-NeoX. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-Offload" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-2" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-3" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-3" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-3" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-3" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-3" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-3" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-3" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-3" maybe. Provide mention of "tensor -Generation took 51.92 seconds -
-
-
β–Ά UV Install Logs
- -
-
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s] -Fetching 3 files: 33%|β–ˆβ–ˆβ–ˆβ–Ž | 1/3 [00:06<00:13, 6.78s/it] -Fetching 3 files: 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 2/3 [00:08<00:03, 3.65s/it] -Fetching 3 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:08<00:00, 2.75s/it] -You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False - -Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s] -Loading checkpoint shards: 33%|β–ˆβ–ˆβ–ˆβ–Ž | 1/3 [00:02<00:04, 2.34s/it] -Loading checkpoint shards: 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 2/3 [00:04<00:02, 2.25s/it] -Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:05<00:00, 1.80s/it] -Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:05<00:00, 1.93s/it] -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` - -Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s] -Fetching 6 files: 17%|β–ˆβ–‹ | 1/6 [00:00<00:01, 3.89it/s] -Fetching 6 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 6/6 [00:00<00:00, 17.67it/s] -/tmp/uvnote-run-hvgovjfd/home/.cache/uv/environments-v2/setup-443c07e337d3be43/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning: -No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation. - warnings.warn( -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: 
v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -/tmp/uvnote-run-hvgovjfd/home/.cache/uv/environments-v2/setup-443c07e337d3be43/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning: -No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation. - warnings.warn( -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
-
-
- -
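The generated answer above describes column-wise tensor parallelism: slice a linear layer's weight along the output dimension, let each device compute a partial output, then gather the pieces. A minimal single-process toy of that idea (illustration only, not how Megatron-LM or the kernels benchmarked here implement it):

```python
import torch

torch.manual_seed(0)
x = torch.randn(4, 8)    # (batch, in_features)
W = torch.randn(16, 8)   # (out_features, in_features)

# Split W along the output dimension into two "device" shards,
# compute partial outputs, then concatenate (the gather step).
W_shards = torch.chunk(W, 2, dim=0)
partials = [x @ shard.T for shard in W_shards]
y_parallel = torch.cat(partials, dim=-1)

assert torch.allclose(y_parallel, x @ W.T, atol=1e-6)
```

Real implementations shard across processes and replace the final concatenation with an all-gather collective, which is where the communication overhead mentioned above comes from.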

Reference kernel

This cell repeats the setup above but leaves GptOssMLP on its default hub kernel mapping (the MegaBlocks MoE MLP, per the install logs below) instead of routing it to Yamoe, so it serves as the reference run for comparison; only the RMSNorm kernel mapping is disabled.

-
-
-Cell: setup2 | 139.97s
-
-
-
-
-
# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-#     "accelerate>=1.10.1",
-#     "torch>=2.7.0",
-#     "kernels==0.10.0",
-#     "transformers@https://github.com/huggingface/transformers.git",
-#     "ipdb>=0.13.13",
-#     "matplotlib>=3.7.2",
-#     "numpy>=1.24.3",
-# ]
-# ///
-
-import torch
-from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
-import time
-import torch.nn as nn
-from kernels import register_kernel_mapping, Mode, LayerRepository
-import sys
-import torch.profiler
-import gc
-import logging
-
-# enable INFO-level logging
-logging.basicConfig(level=logging.INFO)
-
-def reset_peak_memory_stats():
-    """Clear CUDA cache and reset memory allocation counters."""
-    torch.cuda.empty_cache()
-    if torch.cuda.is_available():
-        torch.cuda.reset_peak_memory_stats()
-    gc.collect()
-
-def get_memory_stats():
-    """Get current and peak CUDA memory usage."""
-    if not torch.cuda.is_available():
-        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-    return {
-        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-    }
-
-def override_kernel_layer_name(cls_name: str, value) -> bool:
-    """Helper to dynamically override the kernel_layer_name in a model class."""
-    for mod in sys.modules.values():
-        if mod is None:
-            continue
-        obj = getattr(mod, cls_name, None)
-        if isinstance(obj, type) and issubclass(obj, nn.Module):
-            setattr(obj, "kernel_layer_name", value)
-            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-            return True
-    return False
-
-
-# Init the model the normal way
-model_id = "openai/gpt-oss-20b"
-tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
-quantization_config = Mxfp4Config(dequantize=True)
-
-
-from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
-
-from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
-
-replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
-custom_mapping = {
-    "Yamoe": {
-        "cuda": {
-            Mode.INFERENCE: LayerRepository(
-                repo_id="drbh/yamoe",
-                layer_name="Yamoe",
-                revision="v0.3.0",
-            )
-        }
-    }
-}
-register_kernel_mapping(custom_mapping)
-
-
-model = GptOssForCausalLM.from_pretrained(
-    model_id,
-    dtype="bfloat16",
-    device_map="auto",
-    use_kernels=True,
-    quantization_config=quantization_config,
-).eval()
-
-messages = [
-    {"role": "system", "content": "What is Tensor Parallelism?"},
-]
-
-inputs = tokenizer.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    return_dict=True,
-    reasoning_effort="low",
-).to("cuda")
-
-max_tokens = 512
-
-with torch.inference_mode():
-    start_time = time.perf_counter()
-    generated = model.generate(
-        **inputs,
-        max_new_tokens=max_tokens,
-        do_sample=False,
-        temperature=None,
-    )
-    end_time = time.perf_counter()
-
-print(tokenizer.decode(generated[0], skip_special_tokens=False))
-print(f"Generation took {end_time - start_time:.2f} seconds")
-
- -
-
-
-
-
-
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI. -Knowledge cutoff: 2024-06 -Current date: 2025-09-23 - -Reasoning: low - -# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions - -What is Tensor Parallelism? - -<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it's used in large models like GPT-3, Megatron-LM, DeepSpeed, etc. Provide explanation of how it reduces memory usage, increases throughput. Provide mention of "tensor model parallelism" vs "tensor parallelism" synonyms. Provide mention of "tensor parallelism" in context of huggingface accelerate, DeepSpeed, Megatron. Provide mention of "tensor parallelism" in the context of "tensor parallelism" in the "DeepSpeed ZeRO-Offload" or "ZeRO-3" etc. Provide mention of "tensor parallelism" in the context of "tensor parallelism" in "DeepSpeed" and "Megatron-LM" and "DeepSpeed's ZeRO" and "DeepSpeed's ZeRO-3" and "DeepSpeed's ZeRO-2" etc. Provide mention of "tensor parallelism" in the context of "tensor parallelism" in "DeepSpeed's ZeRO-3" and "DeepSpeed's ZeRO-2" etc. Provide mention of "tensor parallelism" in the context of "tensor parallelism" in "DeepSpeed's ZeRO-3" and "DeepSpeed's ZeRO-2" etc. Provide mention of "tensor parallelism" in the context of "tensor parallelism" in "DeepSpeed's ZeRO-3" and "DeepSpeed's ZeRO-2" etc. Provide mention of "tensor parallelism" in the context of "tensor parallelism" in "DeepSpeed's ZeRO-3" and "DeepSpeed's ZeRO-2" etc. Provide mention of "tensor parallelism" in the context of "tensor parallelism" in "DeepSpeed's ZeRO-3" and "DeepSpeed's ZeRO-2" etc. Provide mention of "tensor parallelism" in the context of "tensor parallelism" in "DeepSpeed's ZeRO-3" and "DeepSpeed's ZeRO-2" etc. Provide mention of "tensor parallelism" in the -Generation took 57.98 seconds -
-
-
β–Ά UV Install Logs
- -
-
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s] -Fetching 3 files: 33%|β–ˆβ–ˆβ–ˆβ–Ž | 1/3 [00:06<00:12, 6.38s/it] -Fetching 3 files: 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 2/3 [00:08<00:03, 3.61s/it] -Fetching 3 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:08<00:00, 2.69s/it] -You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False - -Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s] -Loading checkpoint shards: 33%|β–ˆβ–ˆβ–ˆβ–Ž | 1/3 [00:02<00:04, 2.34s/it] -Loading checkpoint shards: 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 2/3 [00:04<00:02, 2.25s/it] -Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:05<00:00, 1.80s/it] -Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:05<00:00, 1.93s/it] -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` - -Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s] -Fetching 66 files: 2%|▏ | 1/66 [00:00<00:10, 6.10it/s] -Fetching 66 files: 14%|β–ˆβ–Ž | 9/66 [00:00<00:01, 30.47it/s] -Fetching 66 files: 24%|β–ˆβ–ˆβ– | 16/66 [00:00<00:01, 37.56it/s] -Fetching 66 files: 30%|β–ˆβ–ˆβ–ˆ | 20/66 [00:01<00:03, 14.24it/s] -Fetching 66 files: 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 44/66 [00:01<00:00, 37.14it/s] -Fetching 66 files: 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 60/66 [00:01<00:00, 49.97it/s] -Fetching 66 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 66/66 [00:01<00:00, 36.02it/s] -/tmp/uvnote-run-nw4e52ut/home/.cache/uv/environments-v2/setup2-69adf76231e4ab4f/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning: -No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation. 
- warnings.warn( -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -/tmp/uvnote-run-nw4e52ut/home/.cache/uv/environments-v2/setup2-69adf76231e4ab4f/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning: -No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation. 
- warnings.warn( -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-
-
-
- - - \ No newline at end of file diff --git a/note_test_override.md b/note_test_override.md deleted file mode 100644 index 26be4c932d4c74401ce07aaec21323beb5baf7f5..0000000000000000000000000000000000000000 --- a/note_test_override.md +++ /dev/null @@ -1,261 +0,0 @@ ---- -title: "uvnote Integration Test Report" -author: "uvnote" -theme: "light" -syntax_theme: "monokai" -show_line_numbers: true -collapse_code: false -custom_css: | - #output-setup { - overflow-x: auto; - } - .cell-stdout { - width: 100%; - } - .cell-stderr { - width: max-content; - max-height: 300px; - overflow: auto; - } ---- - -```python id=setup -# /// script -# requires-python = ">=3.12" -# dependencies = [ -# "accelerate>=1.10.1", -# "torch>=2.7.0", -# "kernels==0.10.0", -# "transformers@https://github.com/huggingface/transformers.git", -# "ipdb>=0.13.13", -# "matplotlib>=3.7.2", -# "numpy>=1.24.3", -# ] -# /// - -import torch -from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config -import time -import torch.nn as nn -from kernels import register_kernel_mapping, Mode, LayerRepository -import sys -import torch.profiler -import gc -import logging - -# set to debug logging -logging.basicConfig(level=logging.INFO) - -def reset_peak_memory_stats(): - """Clear CUDA cache and reset memory allocation counters.""" - torch.cuda.empty_cache() - if torch.cuda.is_available(): - torch.cuda.reset_peak_memory_stats() - gc.collect() - -def get_memory_stats(): - """Get current and peak CUDA memory usage.""" - if not torch.cuda.is_available(): - return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0} - return { - "allocated_gb": torch.cuda.memory_allocated() / 1e9, - "peak_gb": torch.cuda.max_memory_allocated() / 1e9, - "reserved_gb": torch.cuda.memory_reserved() / 1e9, - } - -def override_kernel_layer_name(cls_name: str, value) -> bool: - """Helper to dynamically override the kernel_layer_name in a model class.""" - for mod in sys.modules.values(): - if mod is None: - continue - obj = getattr(mod, cls_name, None) - if isinstance(obj, type) and issubclass(obj, nn.Module): - setattr(obj, "kernel_layer_name", value) - print(f"Overrode {cls_name}.kernel_layer_name to {value}") - return True - return False - - -# Init the model the normal way -model_id = "openai/gpt-oss-20b" -tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id) -quantization_config = Mxfp4Config(dequantize=True) - - -from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode - -from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm - -replace_kernel_forward_from_hub(GptOssMLP, "Yamoe") # direct, type-safe -replace_kernel_forward_from_hub(GptOssRMSNorm, None) # direct, type-safe -custom_mapping = { - "Yamoe": { - "cuda": { - Mode.INFERENCE: LayerRepository( - repo_id="drbh/yamoe", - layer_name="Yamoe", - revision="v0.3.0", - ) - } - } -} -register_kernel_mapping(custom_mapping) - - -model = GptOssForCausalLM.from_pretrained( - model_id, - dtype="bfloat16", - device_map="auto", - use_kernels=True, - quantization_config=quantization_config, -).eval() - -messages = [ - {"role": "system", "content": "What is Tensor Parallelism?"}, -] - -inputs = tokenizer.apply_chat_template( - messages, - add_generation_prompt=True, - return_tensors="pt", - return_dict=True, - reasoning_effort="low", -).to("cuda") - -max_tokens = 512 - -with torch.inference_mode(): - start_time = time.perf_counter() - generated = model.generate( - **inputs, - max_new_tokens=max_tokens, - do_sample=False, - 
temperature=None, - ) - end_time = time.perf_counter() - -print(tokenizer.decode(generated[0], skip_special_tokens=False)) -print(f"Generation took {end_time - start_time:.2f} seconds") - -``` - -# Reference kernel - -```python id=setup2 -# /// script -# requires-python = ">=3.12" -# dependencies = [ -# "accelerate>=1.10.1", -# "torch>=2.7.0", -# "kernels==0.10.0", -# "transformers@https://github.com/huggingface/transformers.git", -# "ipdb>=0.13.13", -# "matplotlib>=3.7.2", -# "numpy>=1.24.3", -# ] -# /// - -import torch -from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config -import time -import torch.nn as nn -from kernels import register_kernel_mapping, Mode, LayerRepository -import sys -import torch.profiler -import gc -import logging - -# set to debug logging -logging.basicConfig(level=logging.INFO) - -def reset_peak_memory_stats(): - """Clear CUDA cache and reset memory allocation counters.""" - torch.cuda.empty_cache() - if torch.cuda.is_available(): - torch.cuda.reset_peak_memory_stats() - gc.collect() - -def get_memory_stats(): - """Get current and peak CUDA memory usage.""" - if not torch.cuda.is_available(): - return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0} - return { - "allocated_gb": torch.cuda.memory_allocated() / 1e9, - "peak_gb": torch.cuda.max_memory_allocated() / 1e9, - "reserved_gb": torch.cuda.memory_reserved() / 1e9, - } - -def override_kernel_layer_name(cls_name: str, value) -> bool: - """Helper to dynamically override the kernel_layer_name in a model class.""" - for mod in sys.modules.values(): - if mod is None: - continue - obj = getattr(mod, cls_name, None) - if isinstance(obj, type) and issubclass(obj, nn.Module): - setattr(obj, "kernel_layer_name", value) - print(f"Overrode {cls_name}.kernel_layer_name to {value}") - return True - return False - - -# Init the model the normal way -model_id = "openai/gpt-oss-20b" -tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id) -quantization_config = Mxfp4Config(dequantize=True) - - -from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode - -from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm - -replace_kernel_forward_from_hub(GptOssRMSNorm, None) # direct, type-safe -custom_mapping = { - "Yamoe": { - "cuda": { - Mode.INFERENCE: LayerRepository( - repo_id="drbh/yamoe", - layer_name="Yamoe", - revision="v0.3.0", - ) - } - } -} -register_kernel_mapping(custom_mapping) - - -model = GptOssForCausalLM.from_pretrained( - model_id, - dtype="bfloat16", - device_map="auto", - use_kernels=True, - quantization_config=quantization_config, -).eval() - -messages = [ - {"role": "system", "content": "What is Tensor Parallelism?"}, -] - -inputs = tokenizer.apply_chat_template( - messages, - add_generation_prompt=True, - return_tensors="pt", - return_dict=True, - reasoning_effort="low", -).to("cuda") - -max_tokens = 512 - -with torch.inference_mode(): - start_time = time.perf_counter() - generated = model.generate( - **inputs, - max_new_tokens=max_tokens, - do_sample=False, - temperature=None, - ) - end_time = time.perf_counter() - -print(tokenizer.decode(generated[0], skip_special_tokens=False)) -print(f"Generation took {end_time - start_time:.2f} seconds") - -``` \ No newline at end of file diff --git a/site/artifacts/charts/benchmark_dashboard.png b/site/artifacts/charts/benchmark_dashboard.png deleted file mode 100644 index 3e41a250a9cf81deca81d3b8680906932190c3ce..0000000000000000000000000000000000000000 Binary files 
a/site/artifacts/charts/benchmark_dashboard.png and /dev/null differ diff --git a/site/artifacts/charts/latency.png b/site/artifacts/charts/latency.png deleted file mode 100644 index 3a47479e94c29b164c07640ac4e3b0839a09b136..0000000000000000000000000000000000000000 Binary files a/site/artifacts/charts/latency.png and /dev/null differ diff --git a/site/artifacts/charts/memory.png b/site/artifacts/charts/memory.png deleted file mode 100644 index 4f2455d5cf6c7ec026cfb2d3237bb00ade76bb5d..0000000000000000000000000000000000000000 Binary files a/site/artifacts/charts/memory.png and /dev/null differ diff --git a/site/artifacts/charts/throughput.png b/site/artifacts/charts/throughput.png deleted file mode 100644 index 03dca989b1ab57e1a4ba8471c4ef62579b743e0b..0000000000000000000000000000000000000000 Binary files a/site/artifacts/charts/throughput.png and /dev/null differ diff --git a/site/artifacts/setup/benchmark_avg_tokens_per_sec.txt b/site/artifacts/setup/benchmark_avg_tokens_per_sec.txt deleted file mode 100644 index af3f00403ff74eb6871c2c28a93d941c2ce0ae9d..0000000000000000000000000000000000000000 --- a/site/artifacts/setup/benchmark_avg_tokens_per_sec.txt +++ /dev/null @@ -1 +0,0 @@ -5.301658854167735 diff --git a/site/artifacts/setup/benchmark_dashboard.png b/site/artifacts/setup/benchmark_dashboard.png deleted file mode 100644 index e34083a615418e4c216b3c892b87a0bee7a77f64..0000000000000000000000000000000000000000 Binary files a/site/artifacts/setup/benchmark_dashboard.png and /dev/null differ diff --git a/site/artifacts/setup/benchmark_memory.txt b/site/artifacts/setup/benchmark_memory.txt deleted file mode 100644 index cafad0766947b5c3809e939b898b158a2f6520d8..0000000000000000000000000000000000000000 --- a/site/artifacts/setup/benchmark_memory.txt +++ /dev/null @@ -1 +0,0 @@ -9.398672896,9.414898176,10.334765056 diff --git a/site/artifacts/setup/benchmark_times.txt b/site/artifacts/setup/benchmark_times.txt deleted file mode 100644 index 01883f89fd7f6b87427c696c3bdce733de2ab011..0000000000000000000000000000000000000000 --- a/site/artifacts/setup/benchmark_times.txt +++ /dev/null @@ -1,5 +0,0 @@ -12.075035744113848 -12.0710428240709 -12.070115809096023 -12.070908240042627 -12.071364195086062 diff --git a/site/cells/charts.py b/site/cells/charts.py deleted file mode 100644 index 68834c18ee9a315ae893cb7cdca1980324a2e517..0000000000000000000000000000000000000000 --- a/site/cells/charts.py +++ /dev/null @@ -1,140 +0,0 @@ -# /// script -# dependencies = [ -# "matplotlib", -# "numpy", -# ] -# /// - -import matplotlib.pyplot as plt -import numpy as np -import os - -# get the pathf rom UVNOTE_SETUP env var -setup_path = os.getenv("UVNOTE_INPUT_SETUP", ".") -print(f"Reading benchmark data from: {setup_path}") - -num_runs = 5 -max_tokens = 64 -times = [] -with open(os.path.join(setup_path, "benchmark_times.txt"), "r") as f: - for line in f: - times.append(float(line.strip())) - - -avg_time = 0.0 -min_time = 0.0 -max_time = 0.0 -final_mem = {"allocated_gb": 0.0, "peak_gb": 0.0, "reserved_gb": 0.0} - -avg_tokens_per_sec = 0.0 -with open(os.path.join(setup_path, "benchmark_avg_tokens_per_sec.txt"), "r") as f: - avg_tokens_per_sec = float(f.read().strip()) - -times_file = os.path.join(setup_path, "benchmark_times.txt") -memory_file = os.path.join(setup_path, "benchmark_memory.txt") - - -# Minimal brutalist palette (dark theme): grayscale + 1 accent -ACCENT = '#5ec8f8' # calm cyan-blue accent -FG = '#e6e6e6' # light gray text/lines -MUTED = '#9aa0a6' # muted gray for secondary -GRID = '#333333' # grid 
lines - -# Styling tuned for clarity, high contrast, few colors -plt.style.use('dark_background') -plt.rcParams['figure.facecolor'] = 'none' -plt.rcParams['axes.facecolor'] = 'none' -plt.rcParams['savefig.facecolor'] = 'none' -plt.rcParams['savefig.transparent'] = True -plt.rcParams['font.family'] = 'monospace' -plt.rcParams['font.weight'] = 'bold' -plt.rcParams['axes.linewidth'] = 3 -plt.rcParams['grid.linewidth'] = 2 -plt.rcParams['lines.linewidth'] = 3 -plt.rcParams['patch.linewidth'] = 2 - -# Prepare data -runs = list(range(1, len(times) + 1)) -tokens_per_sec_all = [max_tokens / t for t in times] - -# Chart 1: Throughput Performance -fig1, ax1 = plt.subplots(1, 1, figsize=(12, 6)) -fig1.patch.set_alpha(0) -ax1.patch.set_alpha(0) - -ax1.plot(runs, tokens_per_sec_all, color=ACCENT, marker='o', markersize=12, - markerfacecolor=ACCENT, markeredgecolor=FG, markeredgewidth=3, linewidth=5, label='tok/s') -ax1.fill_between(runs, 0, tokens_per_sec_all, alpha=0.2, color=ACCENT) -ax1.axhline(y=avg_tokens_per_sec, color=FG, linestyle='--', linewidth=3, - label=f'AVG: {avg_tokens_per_sec:.1f}') -ax1.set_title('THROUGHPUT PERFORMANCE', color=FG, fontsize=18, pad=20, fontweight='bold') -ax1.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold') -ax1.set_ylabel('TOKENS/SEC', color=FG, fontsize=14, fontweight='bold') -ax1.grid(True, color=GRID, alpha=0.5, linewidth=2) -ax1.tick_params(colors=FG, labelsize=12) -legend1 = ax1.legend(frameon=False, loc='lower right') -for text in legend1.get_texts(): - text.set_color(FG) - text.set_fontweight('bold') -plt.tight_layout() -plt.savefig('throughput.png', dpi=150, bbox_inches='tight', transparent=True) -plt.show() - -# Chart 2: Generation Latency -fig2, ax2 = plt.subplots(1, 1, figsize=(12, 6)) -fig2.patch.set_alpha(0) -ax2.patch.set_alpha(0) - -bar_colors = [ACCENT if i % 2 == 0 else MUTED for i in range(len(times))] -bars = ax2.bar(runs, times, color=bar_colors, edgecolor=FG, linewidth=3, width=0.6) -ax2.axhline(y=avg_time, color=FG, linestyle='--', linewidth=3, - label=f'AVG: {avg_time:.2f}s') -for i, (run, time, bar) in enumerate(zip(runs, times, bars)): - ax2.text(run, time + 0.02, f'{time:.2f}s', ha='center', va='bottom', - color=FG, fontweight='bold', fontsize=11) -ax2.set_title('GENERATION LATENCY', color=FG, fontsize=18, pad=20, fontweight='bold') -ax2.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold') -ax2.set_ylabel('TIME (SECONDS)', color=FG, fontsize=14, fontweight='bold') -ax2.grid(True, axis='y', color=GRID, alpha=0.5, linewidth=2) -ax2.tick_params(colors=FG, labelsize=12) -ax2.set_ylim(0, max(times) * 1.15) -legend2 = ax2.legend(frameon=False, loc='upper right') -for text in legend2.get_texts(): - text.set_color(FG) - text.set_fontweight('bold') -plt.tight_layout() -plt.savefig('latency.png', dpi=150, bbox_inches='tight', transparent=True) -plt.show() - -# Chart 3: Memory Usage -fig3, ax3 = plt.subplots(1, 1, figsize=(12, 6)) -fig3.patch.set_alpha(0) -ax3.patch.set_alpha(0) - -memory_labels = ['ALLOCATED', 'PEAK', 'RESERVED'] -memory_values = [final_mem['allocated_gb'], final_mem['peak_gb'], final_mem['reserved_gb']] -colors_mem = [MUTED, ACCENT, FG] -bars = ax3.barh(memory_labels, memory_values, color=colors_mem, edgecolor=FG, linewidth=3, height=0.5) -for i, (label, value, bar) in enumerate(zip(memory_labels, memory_values, bars)): - ax3.text(value + 0.5, i, f'{value:.1f} GB', va='center', - color=FG, fontweight='bold', fontsize=13) -ax3.set_title('MEMORY USAGE', color=FG, fontsize=18, pad=20, fontweight='bold') 
-ax3.set_xlabel('GIGABYTES', color=FG, fontsize=14, fontweight='bold') -ax3.set_xlim(0, max(memory_values) * 1.3) -ax3.grid(True, axis='x', color=GRID, alpha=0.5, linewidth=2) -ax3.tick_params(colors=FG, labelsize=12) -ax3.set_yticks(range(len(memory_labels))) -ax3.set_yticklabels(memory_labels, fontweight='bold') -plt.tight_layout() -plt.savefig('memory.png', dpi=150, bbox_inches='tight', transparent=True) -plt.show() - -print(f"\nπŸ“Š Charts saved as:") -print(f" β€’ throughput.png") -print(f" β€’ latency.png") -print(f" β€’ memory.png") -print(f"\nBenchmark Summary:") -print(f" avg tokens/sec: {avg_tokens_per_sec:.1f}") -print(f" min time: {min_time:.3f}s") -print(f" max time: {max_time:.3f}s") -print(f" peak memory: {final_mem['peak_gb']:.2f}GB") diff --git a/site/cells/forward_and_backward.py b/site/cells/forward_and_backward.py deleted file mode 100644 index e9c24970def28b0b80af5848de10e41cc87a1349..0000000000000000000000000000000000000000 --- a/site/cells/forward_and_backward.py +++ /dev/null @@ -1,102 +0,0 @@ -# /// script -# requires-python = ">=3.12" -# dependencies = [ -# "accelerate>=1.10.1", -# "torch>=2.7.0", -# "kernels==0.10.0", -# "transformers@https://github.com/huggingface/transformers.git", -# "ipdb>=0.13.13", -# "matplotlib>=3.7.2", -# "numpy>=1.24.3", -# ] -# /// - -import torch -from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config -import time -import torch.nn as nn -from kernels import register_kernel_mapping, Mode, LayerRepository -import sys -import torch.profiler -import gc -import logging -from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm - -# remove liger kernel for testing -replace_kernel_forward_from_hub(GptOssRMSNorm, None) - -# set to debug logging -logging.basicConfig(level=logging.INFO) - -def reset_peak_memory_stats(): - """Clear CUDA cache and reset memory allocation counters.""" - torch.cuda.empty_cache() - if torch.cuda.is_available(): - torch.cuda.reset_peak_memory_stats() - gc.collect() - -def get_memory_stats(): - """Get current and peak CUDA memory usage.""" - if not torch.cuda.is_available(): - return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0} - return { - "allocated_gb": torch.cuda.memory_allocated() / 1e9, - "peak_gb": torch.cuda.max_memory_allocated() / 1e9, - "reserved_gb": torch.cuda.memory_reserved() / 1e9, - } - -def override_kernel_layer_name(cls_name: str, value) -> bool: - """Helper to dynamically override the kernel_layer_name in a model class.""" - for mod in sys.modules.values(): - if mod is None: - continue - obj = getattr(mod, cls_name, None) - if isinstance(obj, type) and issubclass(obj, nn.Module): - setattr(obj, "kernel_layer_name", value) - print(f"Overrode {cls_name}.kernel_layer_name to {value}") - return True - return False - - -# Init the model the normal way -model_id = "openai/gpt-oss-20b" -tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id) -quantization_config = Mxfp4Config(dequantize=True) - -model = GptOssForCausalLM.from_pretrained( - model_id, - dtype="bfloat16", - device_map="auto", - use_kernels=True, - quantization_config=quantization_config, - training=True, -).eval() - -messages = [ - {"role": "system", "content": "What is Tensor Parallelism?"}, -] - -inputs = tokenizer.apply_chat_template( - messages, - add_generation_prompt=True, - return_tensors="pt", - return_dict=True, - reasoning_effort="low", -).to("cuda") - -max_tokens = 512 - - -# forward and backward pass -with torch.autograd.set_grad_enabled(True): - start_time = time.perf_counter() 
- generated = model.generate( - **inputs, - max_new_tokens=max_tokens, - do_sample=False, - temperature=None, - ) - end_time = time.perf_counter() - print(tokenizer.decode(generated[0], skip_special_tokens=False)) - print(f"Generation took {end_time - start_time:.2f} seconds") - diff --git a/site/cells/forward_only.py b/site/cells/forward_only.py deleted file mode 100644 index 9a26615c3e9705cfdbc9be29bdcf68f9d24e597b..0000000000000000000000000000000000000000 --- a/site/cells/forward_only.py +++ /dev/null @@ -1,96 +0,0 @@ -# /// script -# requires-python = ">=3.12" -# dependencies = [ -# "accelerate>=1.10.1", -# "torch>=2.7.0", -# "kernels==0.10.0", -# "transformers@https://github.com/huggingface/transformers.git", -# "ipdb>=0.13.13", -# "matplotlib>=3.7.2", -# "numpy>=1.24.3", -# ] -# /// - -import torch -from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config -import time -import torch.nn as nn -from kernels import register_kernel_mapping, Mode, LayerRepository -import sys -import torch.profiler -import gc -import logging - -# set to debug logging -logging.basicConfig(level=logging.INFO) - -def reset_peak_memory_stats(): - """Clear CUDA cache and reset memory allocation counters.""" - torch.cuda.empty_cache() - if torch.cuda.is_available(): - torch.cuda.reset_peak_memory_stats() - gc.collect() - -def get_memory_stats(): - """Get current and peak CUDA memory usage.""" - if not torch.cuda.is_available(): - return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0} - return { - "allocated_gb": torch.cuda.memory_allocated() / 1e9, - "peak_gb": torch.cuda.max_memory_allocated() / 1e9, - "reserved_gb": torch.cuda.memory_reserved() / 1e9, - } - -def override_kernel_layer_name(cls_name: str, value) -> bool: - """Helper to dynamically override the kernel_layer_name in a model class.""" - for mod in sys.modules.values(): - if mod is None: - continue - obj = getattr(mod, cls_name, None) - if isinstance(obj, type) and issubclass(obj, nn.Module): - setattr(obj, "kernel_layer_name", value) - print(f"Overrode {cls_name}.kernel_layer_name to {value}") - return True - return False - - -# Init the model the normal way -model_id = "openai/gpt-oss-20b" -tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id) -quantization_config = Mxfp4Config(dequantize=True) - - -model = GptOssForCausalLM.from_pretrained( - model_id, - dtype="bfloat16", - device_map="auto", - use_kernels=True, - quantization_config=quantization_config, -).eval() - -messages = [ - {"role": "system", "content": "What is Tensor Parallelism?"}, -] - -inputs = tokenizer.apply_chat_template( - messages, - add_generation_prompt=True, - return_tensors="pt", - return_dict=True, - reasoning_effort="low", -).to("cuda") - -max_tokens = 512 - -with torch.inference_mode(): - start_time = time.perf_counter() - generated = model.generate( - **inputs, - max_new_tokens=max_tokens, - do_sample=False, - temperature=None, - ) - end_time = time.perf_counter() - -print(tokenizer.decode(generated[0], skip_special_tokens=False)) -print(f"Generation took {end_time - start_time:.2f} seconds") diff --git a/site/cells/setup.py b/site/cells/setup.py deleted file mode 100644 index 1d44c93415f7502e5fb2ecec3a07ba74863d372e..0000000000000000000000000000000000000000 --- a/site/cells/setup.py +++ /dev/null @@ -1,116 +0,0 @@ -# /// script -# requires-python = ">=3.12" -# dependencies = [ -# "accelerate>=1.10.1", -# "torch>=2.7.0", -# "kernels==0.10.0", -# "transformers@https://github.com/huggingface/transformers.git", -# "ipdb>=0.13.13", -# 
"matplotlib>=3.7.2", -# "numpy>=1.24.3", -# ] -# /// - -import torch -from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config -import time -import torch.nn as nn -from kernels import register_kernel_mapping, Mode, LayerRepository -import sys -import torch.profiler -import gc -import logging - -# set to debug logging -logging.basicConfig(level=logging.INFO) - -def reset_peak_memory_stats(): - """Clear CUDA cache and reset memory allocation counters.""" - torch.cuda.empty_cache() - if torch.cuda.is_available(): - torch.cuda.reset_peak_memory_stats() - gc.collect() - -def get_memory_stats(): - """Get current and peak CUDA memory usage.""" - if not torch.cuda.is_available(): - return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0} - return { - "allocated_gb": torch.cuda.memory_allocated() / 1e9, - "peak_gb": torch.cuda.max_memory_allocated() / 1e9, - "reserved_gb": torch.cuda.memory_reserved() / 1e9, - } - -def override_kernel_layer_name(cls_name: str, value) -> bool: - """Helper to dynamically override the kernel_layer_name in a model class.""" - for mod in sys.modules.values(): - if mod is None: - continue - obj = getattr(mod, cls_name, None) - if isinstance(obj, type) and issubclass(obj, nn.Module): - setattr(obj, "kernel_layer_name", value) - print(f"Overrode {cls_name}.kernel_layer_name to {value}") - return True - return False - - -# Init the model the normal way -model_id = "openai/gpt-oss-20b" -tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id) -quantization_config = Mxfp4Config(dequantize=True) - - -from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode - -from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm - -replace_kernel_forward_from_hub(GptOssMLP, "Yamoe") # direct, type-safe -replace_kernel_forward_from_hub(GptOssRMSNorm, None) # direct, type-safe -custom_mapping = { - "Yamoe": { - "cuda": { - Mode.INFERENCE: LayerRepository( - repo_id="drbh/yamoe", - layer_name="Yamoe", - revision="v0.3.0", - ) - } - } -} -register_kernel_mapping(custom_mapping) - - -model = GptOssForCausalLM.from_pretrained( - model_id, - dtype="bfloat16", - device_map="auto", - use_kernels=True, - quantization_config=quantization_config, -).eval() - -messages = [ - {"role": "system", "content": "What is Tensor Parallelism?"}, -] - -inputs = tokenizer.apply_chat_template( - messages, - add_generation_prompt=True, - return_tensors="pt", - return_dict=True, - reasoning_effort="low", -).to("cuda") - -max_tokens = 512 - -with torch.inference_mode(): - start_time = time.perf_counter() - generated = model.generate( - **inputs, - max_new_tokens=max_tokens, - do_sample=False, - temperature=None, - ) - end_time = time.perf_counter() - -print(tokenizer.decode(generated[0], skip_special_tokens=False)) -print(f"Generation took {end_time - start_time:.2f} seconds") diff --git a/site/cells/setup2.py b/site/cells/setup2.py deleted file mode 100644 index 1e0c0cda49faa5c3bac38e61579c62e182be98df..0000000000000000000000000000000000000000 --- a/site/cells/setup2.py +++ /dev/null @@ -1,115 +0,0 @@ -# /// script -# requires-python = ">=3.12" -# dependencies = [ -# "accelerate>=1.10.1", -# "torch>=2.7.0", -# "kernels==0.10.0", -# "transformers@https://github.com/huggingface/transformers.git", -# "ipdb>=0.13.13", -# "matplotlib>=3.7.2", -# "numpy>=1.24.3", -# ] -# /// - -import torch -from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config -import time -import torch.nn as nn -from kernels import 
register_kernel_mapping, Mode, LayerRepository -import sys -import torch.profiler -import gc -import logging - -# set to debug logging -logging.basicConfig(level=logging.INFO) - -def reset_peak_memory_stats(): - """Clear CUDA cache and reset memory allocation counters.""" - torch.cuda.empty_cache() - if torch.cuda.is_available(): - torch.cuda.reset_peak_memory_stats() - gc.collect() - -def get_memory_stats(): - """Get current and peak CUDA memory usage.""" - if not torch.cuda.is_available(): - return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0} - return { - "allocated_gb": torch.cuda.memory_allocated() / 1e9, - "peak_gb": torch.cuda.max_memory_allocated() / 1e9, - "reserved_gb": torch.cuda.memory_reserved() / 1e9, - } - -def override_kernel_layer_name(cls_name: str, value) -> bool: - """Helper to dynamically override the kernel_layer_name in a model class.""" - for mod in sys.modules.values(): - if mod is None: - continue - obj = getattr(mod, cls_name, None) - if isinstance(obj, type) and issubclass(obj, nn.Module): - setattr(obj, "kernel_layer_name", value) - print(f"Overrode {cls_name}.kernel_layer_name to {value}") - return True - return False - - -# Init the model the normal way -model_id = "openai/gpt-oss-20b" -tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id) -quantization_config = Mxfp4Config(dequantize=True) - - -from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode - -from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm - -replace_kernel_forward_from_hub(GptOssRMSNorm, None) # direct, type-safe -custom_mapping = { - "Yamoe": { - "cuda": { - Mode.INFERENCE: LayerRepository( - repo_id="drbh/yamoe", - layer_name="Yamoe", - revision="v0.3.0", - ) - } - } -} -register_kernel_mapping(custom_mapping) - - -model = GptOssForCausalLM.from_pretrained( - model_id, - dtype="bfloat16", - device_map="auto", - use_kernels=True, - quantization_config=quantization_config, -).eval() - -messages = [ - {"role": "system", "content": "What is Tensor Parallelism?"}, -] - -inputs = tokenizer.apply_chat_template( - messages, - add_generation_prompt=True, - return_tensors="pt", - return_dict=True, - reasoning_effort="low", -).to("cuda") - -max_tokens = 512 - -with torch.inference_mode(): - start_time = time.perf_counter() - generated = model.generate( - **inputs, - max_new_tokens=max_tokens, - do_sample=False, - temperature=None, - ) - end_time = time.perf_counter() - -print(tokenizer.decode(generated[0], skip_special_tokens=False)) -print(f"Generation took {end_time - start_time:.2f} seconds") diff --git a/site/megablocks_only.html b/site/megablocks_only.html deleted file mode 100644 index 9626698e0d7527be74b531739756fd38544ed636..0000000000000000000000000000000000000000 --- a/site/megablocks_only.html +++ /dev/null @@ -1,4250 +0,0 @@ - - - - - - Megablocks Only Test - - - - - - - -
Generated on: Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31

Reference kernel

Cell: forward_only | 100.45s | FAILED
# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-#     "accelerate>=1.10.1",
-#     "torch>=2.7.0",
-#     "kernels==0.10.0",
-#     "transformers@https://github.com/huggingface/transformers.git",
-#     "ipdb>=0.13.13",
-#     "matplotlib>=3.7.2",
-#     "numpy>=1.24.3",
-# ]
-# ///
-
-import torch
-from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
-import time
-import torch.nn as nn
-from kernels import register_kernel_mapping, Mode, LayerRepository
-import sys
-import torch.profiler
-import gc
-import logging
-
-# set to debug logging
-logging.basicConfig(level=logging.INFO)
-
-def reset_peak_memory_stats():
-    """Clear CUDA cache and reset memory allocation counters."""
-    torch.cuda.empty_cache()
-    if torch.cuda.is_available():
-        torch.cuda.reset_peak_memory_stats()
-    gc.collect()
-
-def get_memory_stats():
-    """Get current and peak CUDA memory usage."""
-    if not torch.cuda.is_available():
-        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-    return {
-        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-    }
-
-def override_kernel_layer_name(cls_name: str, value) -> bool:
-    """Helper to dynamically override the kernel_layer_name in a model class."""
-    for mod in sys.modules.values():
-        if mod is None:
-            continue
-        obj = getattr(mod, cls_name, None)
-        if isinstance(obj, type) and issubclass(obj, nn.Module):
-            setattr(obj, "kernel_layer_name", value)
-            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-            return True
-    return False
-
-
-# Init the model the normal way
-model_id = "openai/gpt-oss-20b"
-tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
-quantization_config = Mxfp4Config(dequantize=True)
-
-
-model = GptOssForCausalLM.from_pretrained(
-    model_id,
-    dtype="bfloat16",
-    device_map="auto",
-    use_kernels=True,
-    quantization_config=quantization_config,
-).eval()
-
-messages = [
-    {"role": "system", "content": "What is Tensor Parallelism?"},
-]
-
-inputs = tokenizer.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    return_dict=True,
-    reasoning_effort="low",
-).to("cuda")
-
-max_tokens = 512
-
-with torch.inference_mode():
-    start_time = time.perf_counter()
-    generated = model.generate(
-        **inputs,
-        max_new_tokens=max_tokens,
-        do_sample=False,
-        temperature=None,
-    )
-    end_time = time.perf_counter()
-
-print(tokenizer.decode(generated[0], skip_special_tokens=False))
-print(f"Generation took {end_time - start_time:.2f} seconds")
-
Fetching 3 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:17<00:00, 5.97s/it]
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:08<00:00, 2.68s/it]
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
Fetching 66 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 66/66 [00:01<00:00, 47.76it/s]
INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
Fetching 17 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17/17 [00:00<00:00, 128.06it/s]
[the two `Using layer` INFO lines above repeat for every remaining decoder layer in the model]
Traceback (most recent call last):
  File "/home/ubuntu/Projects/yamoe-gpt-integration/.uvnote/cells/forward_only.py", line 87, in <module>
    generated = model.generate(
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
    return func(*args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/generation/utils.py", line 2546, in generate
    result = decoding_method(
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/generation/utils.py", line 2766, in _sample
    outputs = self(**model_inputs, return_dict=True)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/accelerate/hooks.py", line 175, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/utils/generic.py", line 783, in wrapper
    output = func(self, *args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/models/gpt_oss/modeling_gpt_oss.py", line 668, in forward
    outputs: MoeModelOutputWithPast = self.model(
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/utils/generic.py", line 929, in wrapper
    outputs = func(self, *args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/models/gpt_oss/modeling_gpt_oss.py", line 507, in forward
    hidden_states = decoder_layer(
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__
    return super().__call__(*args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/accelerate/hooks.py", line 175, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
    return func(*args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/models/gpt_oss/modeling_gpt_oss.py", line 369, in forward
    hidden_states = self.input_layernorm(hidden_states)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/huggingface/hub/models--kernels-community--liger_kernels/snapshots/7435d25a8faf175758be14046371a5b0c686f94c/build/torch-universal/liger_kernels/layers.py", line 30, in forward
    return LigerRMSNormFunction.apply(
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/autograd/function.py", line 576, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/huggingface/hub/models--kernels-community--liger_kernels/snapshots/7435d25a8faf175758be14046371a5b0c686f94c/build/torch-universal/liger_kernels/utils.py", line 48, in wrapper
    return fn(ctx, *args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/huggingface/hub/models--kernels-community--liger_kernels/snapshots/7435d25a8faf175758be14046371a5b0c686f94c/build/torch-universal/liger_kernels/rms_norm.py", line 338, in forward
    Y, X, RSTD, BLOCK_SIZE, num_warps, casting_mode = rms_norm_forward(X, W, eps, offset, casting_mode)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/huggingface/hub/models--kernels-community--liger_kernels/snapshots/7435d25a8faf175758be14046371a5b0c686f94c/build/torch-universal/liger_kernels/rms_norm.py", line 230, in rms_norm_forward
    _rms_norm_forward_kernel[(n_rows,)](
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/triton/runtime/jit.py", line 390, in <lambda>
    return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/triton/runtime/jit.py", line 617, in run
    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
  File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 708, in __call__
    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,
ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
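The Triton launcher is rejecting a pointer that apparently resolves to CPU memory when the kernelized `LigerRMSNorm` runs inside generation. A minimal, hypothetical check (not part of the original cell; `report_non_cuda_tensors` is an illustrative name) could be used to see whether `device_map="auto"` left any parameters or buffers off the GPU:

```python
# Hypothetical debugging helper (not in the original report): list every
# parameter or buffer that is not on a CUDA device, since Triton-backed
# kernels such as LigerRMSNorm can only dereference GPU pointers.
import torch

def report_non_cuda_tensors(model: torch.nn.Module) -> None:
    offenders = [
        (name, tensor.device)
        for name, tensor in list(model.named_parameters()) + list(model.named_buffers())
        if tensor.device.type != "cuda"
    ]
    for name, device in offenders:
        print(f"{name}: {device}")
    print(f"{len(offenders)} tensors are not on a CUDA device")

# Example usage, assuming `model` was loaded with device_map="auto":
# report_non_cuda_tensors(model)
```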
Backwards
- - - \ No newline at end of file diff --git a/site/note.html b/site/note.html deleted file mode 100644 index 110dc576a7fde9f2b9e6959b1d07117e007578db..0000000000000000000000000000000000000000 --- a/site/note.html +++ /dev/null @@ -1,3733 +0,0 @@ - - - - - - uvnote Integration Test Report - - - - - -
Generated on: Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31

Cell: setup | 304.89s
# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-#     "accelerate>=1.10.1",
-#     "torch>=2.7.0",
-#     "kernels==0.10.0",
-#     "transformers@https://github.com/huggingface/transformers.git",
-#     "ipdb>=0.13.13",
-#     "matplotlib>=3.7.2",
-#     "numpy>=1.24.3",
-# ]
-# ///
-
-import torch
-from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
-import time
-import torch.nn as nn
-from kernels import register_kernel_mapping, Mode, LayerRepository
-import sys
-import torch.profiler
-import gc
-
-def reset_peak_memory_stats():
-    """Clear CUDA cache and reset memory allocation counters."""
-    torch.cuda.empty_cache()
-    if torch.cuda.is_available():
-        torch.cuda.reset_peak_memory_stats()
-    gc.collect()
-
-def get_memory_stats():
-    """Get current and peak CUDA memory usage."""
-    if not torch.cuda.is_available():
-        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-    return {
-        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-    }
-
-def override_kernel_layer_name(cls_name: str, value) -> bool:
-    """Helper to dynamically override the kernel_layer_name in a model class."""
-    for mod in sys.modules.values():
-        if mod is None:
-            continue
-        obj = getattr(mod, cls_name, None)
-        if isinstance(obj, type) and issubclass(obj, nn.Module):
-            setattr(obj, "kernel_layer_name", value)
-            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-            return True
-    return False
-
-def run_generation(model, inputs, max_tokens=64):
-    """Run a single generation pass and measure its duration."""
-    with torch.inference_mode():
-        start_time = time.perf_counter()
-        generated = model.generate(
-            **inputs,
-            max_new_tokens=max_tokens,
-            do_sample=False,
-            temperature=None,
-        )
-        end_time = time.perf_counter()
-    return generated, end_time - start_time
-
-
-# Init the model the normal way
-model_id = "openai/gpt-oss-20b"
-tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
-quantization_config = Mxfp4Config(dequantize=True)
-
-# Now we want to add some custom kernel mapping
-custom_mapping = dict(
-    Yamoe=dict(
-        cuda={
-            Mode.INFERENCE: LayerRepository(
-                repo_id="drbh/yamoe",
-                layer_name="Yamoe",
-                revision="v0.3.0",
-            ),
-        },
-    )
-)
-# First add the mapping
-register_kernel_mapping(custom_mapping)
-# Then override the layer name in the model class
-override_kernel_layer_name("GptOssMLP", "Yamoe")
-
-# TODO: remove this line once RMSNorm is working
-override_kernel_layer_name("GptOssRMSNorm", None)
-
-## Normal model stuff
-
-model = GptOssForCausalLM.from_pretrained(
-    model_id,
-    dtype="bfloat16",
-    device_map="auto",
-    use_kernels=True,
-    quantization_config=quantization_config,
-).eval()
-
-
-messages = [
-    {"role": "system", "content": "What is Tensor Parallelism?"},
-]
-
-inputs = tokenizer.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    return_dict=True,
-    reasoning_effort="low",
-).to("cuda")
-
-
-
-def run_generation(model, inputs, max_tokens=64):
-    with torch.inference_mode():
-        start_time = time.perf_counter()
-        generated = model.generate(
-            **inputs,
-            max_new_tokens=max_tokens,
-            do_sample=False,
-            temperature=None,
-        )
-        end_time = time.perf_counter()
-    return generated, end_time - start_time
-
-
-print("\n=== Running Benchmarks ===")
-print(f"Model: {model_id}")
-print(f"Device: {torch.cuda.get_device_name()}")
-print(f"Initial memory: {get_memory_stats()}\n")
-
-# Warmup
-print("Running warmup...")
-for _ in range(2):
-    _ = run_generation(model, inputs, max_tokens=16)
-
-reset_peak_memory_stats()
-
-# Benchmark runs
-num_runs = 5
-max_tokens = 64
-times = []
-
-print(f"\nRunning {num_runs} benchmark iterations with {max_tokens} tokens...")
-for i in range(num_runs):
-    reset_peak_memory_stats()
-    generated, elapsed = run_generation(model, inputs, max_tokens)
-    times.append(elapsed)
-    mem_stats = get_memory_stats()
-    tokens_per_sec = max_tokens / elapsed
-    print(f"Run {i+1}: {elapsed:.3f}s ({tokens_per_sec:.1f} tok/s) | Peak: {mem_stats['peak_gb']:.2f}GB")
-
-# Statistics
-avg_time = sum(times) / len(times)
-min_time = min(times)
-max_time = max(times)
-avg_tokens_per_sec = max_tokens / avg_time
-
-print(f"\n=== Benchmark Results ===")
-print(f"Average: {avg_time:.3f}s ({avg_tokens_per_sec:.1f} tok/s)")
-print(f"Min: {min_time:.3f}s | Max: {max_time:.3f}s")
-
-# Final memory stats
-final_mem = get_memory_stats()
-print(f"\nFinal Memory:")
-print(f"  Allocated: {final_mem['allocated_gb']:.2f}GB")
-print(f"  Peak: {final_mem['peak_gb']:.2f}GB")
-print(f"  Reserved: {final_mem['reserved_gb']:.2f}GB")
-
-
-print("\n=== Running with Profiler ===")
-reset_peak_memory_stats()
-
-with torch.profiler.profile(
-    activities=[
-        torch.profiler.ProfilerActivity.CPU,
-        torch.profiler.ProfilerActivity.CUDA,
-    ],
-    record_shapes=True,
-    profile_memory=True,
-    with_stack=True,
-) as prof:
-    generated, elapsed = run_generation(model, inputs, max_tokens=64)
-
-print(f"Generation time: {elapsed:.2f} seconds")
-
-# Print profiler results
-print("\n=== Top 10 CUDA operations by time ===")
-print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
-
-print("\n=== Top 10 operations by memory ===")
-print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))
-
-# Memory stats
-mem_stats = get_memory_stats()
-print(f"\nPeak Memory: {mem_stats['peak_gb']:.2f}GB")
-
-# Save trace if needed
-prof.export_chrome_trace("trace.json")
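-# (trace.json is in Chrome trace format and can be opened in chrome://tracing or https://ui.perfetto.dev)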
-print("\nProfile trace saved to trace.json")
-
-
-# Decode and print output
-print("\nGenerated text:")
-print(tokenizer.decode(generated[0][inputs["input_ids"].shape[-1] :]))
-
-
-# save times and memory stats for charting
-with open("benchmark_times.txt", "w") as f:
-    for t in times:
-        f.write(f"{t}\n")
-with open("benchmark_memory.txt", "w") as f:
-    f.write(f"{final_mem['allocated_gb']},{final_mem['peak_gb']},{final_mem['reserved_gb']}\n")
-
-# save avg_tokens_per_sec for charting
-with open("benchmark_avg_tokens_per_sec.txt", "w") as f:
-    f.write(f"{avg_tokens_per_sec}\n")
-
-
- -
-
-
Overrode GptOssMLP.kernel_layer_name to Yamoe
-Overrode GptOssRMSNorm.kernel_layer_name to None
-
-=== Running Benchmarks ===
-Model: openai/gpt-oss-20b
-Device: NVIDIA L4
-Initial memory: {'allocated_gb': 9.390148608, 'peak_gb': 15.5643264, 'reserved_gb': 17.177772032}
-
-Running warmup...
-
-Running 5 benchmark iterations with 64 tokens...
-Run 1: 12.075s (5.3 tok/s) | Peak: 9.41GB
-Run 2: 12.071s (5.3 tok/s) | Peak: 9.41GB
-Run 3: 12.070s (5.3 tok/s) | Peak: 9.41GB
-Run 4: 12.071s (5.3 tok/s) | Peak: 9.41GB
-Run 5: 12.071s (5.3 tok/s) | Peak: 9.41GB
-
-=== Benchmark Results ===
-Average: 12.072s (5.3 tok/s)
-Min: 12.070s | Max: 12.075s
-
-Final Memory:
-  Allocated: 9.40GB
-  Peak: 9.41GB
-  Reserved: 10.33GB
-
-=== Running with Profiler ===
-Generation time: 12.73 seconds
-
-=== Top 10 CUDA operations by time ===
-Name | Self CPU % | Self CPU | CPU total % | CPU total | CPU time avg | Self CUDA | Self CUDA % | CUDA total | CUDA time avg | CPU Mem | Self CPU Mem | CUDA Mem | Self CUDA Mem | # of Calls
-_yamoe_74a2acb_dirty::experts | 1.40% | 148.156ms | 66.87% | 7.074s | 4.606ms | 52.388ms | 0.46% | 10.583s | 6.890ms | 0 B | -2.98 KB | 18.88 MB | -2.11 GB | 1536
-aten::bmm | 1.25% | 132.560ms | 1.75% | 185.015ms | 29.803us | 10.486s | 91.79% | 10.486s | 1.689ms | 0 B | 0 B | 63.12 MB | 63.12 MB | 6208
-void cutlass::Kernel2<cutlass_80_wmma_tensorop_bf16_... | 0.00% | 0.000us | 0.00% | 0.000us | 0.000us | 10.319s | 90.32% | 10.319s | 3.412ms | 0 B | 0 B | 0 B | 0 B | 3024
-aten::linear | 0.54% | 57.566ms | 3.78% | 399.802ms | 51.627us | 0.000us | 0.00% | 645.165ms | 83.312us | 0 B | 0 B | 76.88 MB | 0 B | 7744
-aten::addmm | 1.81% | 191.354ms | 2.57% | 272.095ms | 35.429us | 352.039ms | 3.08% | 352.151ms | 45.853us | 0 B | 0 B | 52.31 MB | 52.31 MB | 7680
-std::enable_if<!(false), void>::type internal::gemvx... | 0.00% | 0.000us | 0.00% | 0.000us | 0.000us | 344.917ms | 3.02% | 344.917ms | 74.982us | 0 B | 0 B | 0 B | 0 B | 4600
-aten::matmul | 0.31% | 32.441ms | 1.72% | 181.712ms | 56.785us | 0.000us | 0.00% | 303.821ms | 94.944us | 0 B | 0 B | 87.68 MB | 0 B | 3200
-std::enable_if<!(false), void>::type internal::gemvx... | 0.00% | 0.000us | 0.00% | 0.000us | 0.000us | 293.850ms | 2.57% | 293.850ms | 97.173us | 0 B | 0 B | 0 B | 0 B | 3024
-aten::mm | 0.01% | 1.506ms | 0.02% | 2.161ms | 33.768us | 293.014ms | 2.56% | 293.014ms | 4.578ms | 0 B | 0 B | 24.56 MB | 24.56 MB | 64
-ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn | 0.00% | 0.000us | 0.00% | 0.000us | 0.000us | 102.278ms | 0.90% | 102.278ms | 4.262ms | 0 B | 0 B | 0 B | 0 B | 24
-Self CPU time total: 10.579s
-Self CUDA time total: 11.424s
-
-=== Top 10 operations by memory ===
-Name | Self CPU % | Self CPU | CPU total % | CPU total | CPU time avg | Self CUDA | Self CUDA % | CUDA total | CUDA time avg | CPU Mem | Self CPU Mem | CUDA Mem | Self CUDA Mem | # of Calls
-aten::empty | 0.68% | 72.026ms | 0.68% | 72.026ms | 4.244us | 0.000us | 0.00% | 0.000us | 0.000us | 296 B | 296 B | 3.49 GB | 3.49 GB | 16973
-aten::clamp | 0.46% | 48.185ms | 0.69% | 72.630ms | 15.762us | 10.269ms | 0.09% | 10.269ms | 2.229us | 0 B | 0 B | 616.69 MB | 616.69 MB | 4608
-aten::mul | 1.76% | 186.048ms | 2.93% | 310.383ms | 14.181us | 47.780ms | 0.42% | 47.792ms | 2.184us | 784 B | 784 B | 554.93 MB | 554.93 MB | 21888
-aten::cat | 0.78% | 82.030ms | 1.22% | 129.113ms | 16.536us | 17.028ms | 0.15% | 17.030ms | 2.181us | 0 B | 0 B | 387.88 MB | 387.88 MB | 7808
-aten::sigmoid | 0.09% | 9.855ms | 0.16% | 16.652ms | 10.841us | 2.889ms | 0.03% | 2.889ms | 1.881us | 0 B | 0 B | 307.97 MB | 307.97 MB | 1536
-aten::empty_strided | 1.08% | 114.498ms | 1.10% | 116.720ms | 5.564us | 0.000us | 0.00% | 0.000us | 0.000us | 0 B | 0 B | 216.60 MB | 216.60 MB | 20979
-aten::add | 0.93% | 97.861ms | 1.56% | 164.673ms | 15.047us | 16.394ms | 0.14% | 16.395ms | 1.498us | 0 B | 0 B | 91.03 MB | 91.03 MB | 10944
-aten::pow | 0.36% | 38.271ms | 0.55% | 58.020ms | 18.501us | 4.117ms | 0.04% | 4.117ms | 1.313us | 0 B | 0 B | 75.58 MB | 75.58 MB | 3136
-aten::bmm | 1.25% | 132.560ms | 1.75% | 185.015ms | 29.803us | 10.486s | 91.79% | 10.486s | 1.689ms | 0 B | 0 B | 63.12 MB | 63.12 MB | 6208
-aten::sub | 0.51% | 53.869ms | 0.82% | 87.218ms | 13.626us | 9.277ms | 0.08% | 9.355ms | 1.461us | 0 B | 0 B | 53.04 MB | 53.01 MB | 6401
-Self CPU time total: 10.579s
-Self CUDA time total: 11.424s
-
-Peak Memory: 9.41GB
-
-Profile trace saved to trace.json
-
-Generated text:
-<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices, so each device holds a slice of the matrix. During forward/backward passes, each device computes partial results and then they are
-
-
β–Ά UV Install Logs
- -
-
Fetching 3 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:16<00:00, 5.54s/it]
-You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
-Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:08<00:00, 2.68s/it]
-Fetching 6 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 6/6 [00:00<00:00, 12.15it/s]
-/tmp/uvnote-run-hjgpkuq6/home/.cache/uv/environments-v2/setup-30bb029f3f83f37d/lib/python3.12/site-packages/kernels/layer.py:868: UserWarning:
-No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
-  warnings.warn(
- -
-
- -
-
-Cell: charts | deps: matplotlib, numpy | 3.51s
-
-
import matplotlib.pyplot as plt
-import numpy as np
-import os
-
-# get the path from the UVNOTE_INPUT_SETUP env var (the setup cell's output directory)
-setup_path = os.getenv("UVNOTE_INPUT_SETUP", ".")
-print(f"Reading benchmark data from: {setup_path}")
-
-num_runs = 5
-max_tokens = 64
-times = []
-with open(os.path.join(setup_path, "benchmark_times.txt"), "r") as f:
-    for line in f:
-        times.append(float(line.strip()))
-
-
-# Summary statistics derived from the per-run times loaded above
-avg_time = sum(times) / len(times)
-min_time = min(times)
-max_time = max(times)
-
-with open(os.path.join(setup_path, "benchmark_avg_tokens_per_sec.txt"), "r") as f:
-    avg_tokens_per_sec = float(f.read().strip())
-
-# Memory stats written by the setup cell as "allocated,peak,reserved" in GB
-memory_file = os.path.join(setup_path, "benchmark_memory.txt")
-with open(memory_file, "r") as f:
-    allocated_gb, peak_gb, reserved_gb = (float(x) for x in f.read().strip().split(","))
-final_mem = {"allocated_gb": allocated_gb, "peak_gb": peak_gb, "reserved_gb": reserved_gb}
-
-
-# Minimal brutalist palette (dark theme): grayscale + 1 accent
-ACCENT = '#5ec8f8'   # calm cyan-blue accent
-FG = '#e6e6e6'       # light gray text/lines
-MUTED = '#9aa0a6'    # muted gray for secondary
-GRID = '#333333'     # grid lines
-
-# Styling tuned for clarity, high contrast, few colors
-plt.style.use('dark_background')
-plt.rcParams['figure.facecolor'] = 'none'
-plt.rcParams['axes.facecolor'] = 'none'
-plt.rcParams['savefig.facecolor'] = 'none'
-plt.rcParams['savefig.transparent'] = True
-plt.rcParams['font.family'] = 'monospace'
-plt.rcParams['font.weight'] = 'bold'
-plt.rcParams['axes.linewidth'] = 3
-plt.rcParams['grid.linewidth'] = 2
-plt.rcParams['lines.linewidth'] = 3
-plt.rcParams['patch.linewidth'] = 2
-
-# Prepare data
-runs = list(range(1, len(times) + 1))
-tokens_per_sec_all = [max_tokens / t for t in times]
-
-# Chart 1: Throughput Performance
-fig1, ax1 = plt.subplots(1, 1, figsize=(12, 6))
-fig1.patch.set_alpha(0)
-ax1.patch.set_alpha(0)
-
-ax1.plot(runs, tokens_per_sec_all, color=ACCENT, marker='o', markersize=12,
-         markerfacecolor=ACCENT, markeredgecolor=FG, markeredgewidth=3, linewidth=5, label='tok/s')
-ax1.fill_between(runs, 0, tokens_per_sec_all, alpha=0.2, color=ACCENT)
-ax1.axhline(y=avg_tokens_per_sec, color=FG, linestyle='--', linewidth=3,
-            label=f'AVG: {avg_tokens_per_sec:.1f}')
-ax1.set_title('THROUGHPUT PERFORMANCE', color=FG, fontsize=18, pad=20, fontweight='bold')
-ax1.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
-ax1.set_ylabel('TOKENS/SEC', color=FG, fontsize=14, fontweight='bold')
-ax1.grid(True, color=GRID, alpha=0.5, linewidth=2)
-ax1.tick_params(colors=FG, labelsize=12)
-legend1 = ax1.legend(frameon=False, loc='lower right')
-for text in legend1.get_texts():
-    text.set_color(FG)
-    text.set_fontweight('bold')
-plt.tight_layout()
-plt.savefig('throughput.png', dpi=150, bbox_inches='tight', transparent=True)
-plt.show()
-
-# Chart 2: Generation Latency
-fig2, ax2 = plt.subplots(1, 1, figsize=(12, 6))
-fig2.patch.set_alpha(0)
-ax2.patch.set_alpha(0)
-
-bar_colors = [ACCENT if i % 2 == 0 else MUTED for i in range(len(times))]
-bars = ax2.bar(runs, times, color=bar_colors, edgecolor=FG, linewidth=3, width=0.6)
-ax2.axhline(y=avg_time, color=FG, linestyle='--', linewidth=3,
-            label=f'AVG: {avg_time:.2f}s')
-for i, (run, time, bar) in enumerate(zip(runs, times, bars)):
-    ax2.text(run, time + 0.02, f'{time:.2f}s', ha='center', va='bottom',
-             color=FG, fontweight='bold', fontsize=11)
-ax2.set_title('GENERATION LATENCY', color=FG, fontsize=18, pad=20, fontweight='bold')
-ax2.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
-ax2.set_ylabel('TIME (SECONDS)', color=FG, fontsize=14, fontweight='bold')
-ax2.grid(True, axis='y', color=GRID, alpha=0.5, linewidth=2)
-ax2.tick_params(colors=FG, labelsize=12)
-ax2.set_ylim(0, max(times) * 1.15)
-legend2 = ax2.legend(frameon=False, loc='upper right')
-for text in legend2.get_texts():
-    text.set_color(FG)
-    text.set_fontweight('bold')
-plt.tight_layout()
-plt.savefig('latency.png', dpi=150, bbox_inches='tight', transparent=True)
-plt.show()
-
-# Chart 3: Memory Usage
-fig3, ax3 = plt.subplots(1, 1, figsize=(12, 6))
-fig3.patch.set_alpha(0)
-ax3.patch.set_alpha(0)
-
-memory_labels = ['ALLOCATED', 'PEAK', 'RESERVED']
-memory_values = [final_mem['allocated_gb'], final_mem['peak_gb'], final_mem['reserved_gb']]
-colors_mem = [MUTED, ACCENT, FG]
-bars = ax3.barh(memory_labels, memory_values, color=colors_mem, edgecolor=FG, linewidth=3, height=0.5)
-for i, (label, value, bar) in enumerate(zip(memory_labels, memory_values, bars)):
-    ax3.text(value + 0.5, i, f'{value:.1f} GB', va='center',
-             color=FG, fontweight='bold', fontsize=13)
-ax3.set_title('MEMORY USAGE', color=FG, fontsize=18, pad=20, fontweight='bold')
-ax3.set_xlabel('GIGABYTES', color=FG, fontsize=14, fontweight='bold')
-ax3.set_xlim(0, max(memory_values) * 1.3)
-ax3.grid(True, axis='x', color=GRID, alpha=0.5, linewidth=2)
-ax3.tick_params(colors=FG, labelsize=12)
-ax3.set_yticks(range(len(memory_labels)))
-ax3.set_yticklabels(memory_labels, fontweight='bold')
-plt.tight_layout()
-plt.savefig('memory.png', dpi=150, bbox_inches='tight', transparent=True)
-plt.show()
-
-print(f"\nπŸ“Š Charts saved as:")
-print(f"  β€’ throughput.png")
-print(f"  β€’ latency.png")
-print(f"  β€’ memory.png")
-print(f"\nBenchmark Summary:")
-print(f"  avg tokens/sec: {avg_tokens_per_sec:.1f}")
-print(f"  min time: {min_time:.3f}s")
-print(f"  max time: {max_time:.3f}s")
-print(f"  peak memory: {final_mem['peak_gb']:.2f}GB")
-
- -
-
-
Reading benchmark data from: /home/ubuntu/Projects/yamoe-gpt-integration/.uvnote/cache/0e89c413a25ded7b4d6fab2a010f0538ba2b35fb5f619a0dfced3121d3ccf879
-
-πŸ“Š Charts saved as:
-  β€’ throughput.png
-  β€’ latency.png
-  β€’ memory.png
-
-Benchmark Summary:
-  avg tokens/sec: 5.3
-  min time: 0.000s
-  max time: 0.000s
-  peak memory: 0.00GB
-
-
β–Ά UV Install Logs
- -
-
/home/ubuntu/Projects/yamoe-gpt-integration/.uvnote/cells/charts.py:123: UserWarning: Attempting to set identical low and high xlims makes transformation singular; automatically expanding.
-  ax3.set_xlim(0, max(memory_values) * 1.3)
-/home/ubuntu/Projects/yamoe-gpt-integration/.uvnote/cells/charts.py:128: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all Axes decorations.
-  plt.tight_layout()
-
-

Artifacts:

-latency.png
-memory.png
-throughput.png
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/site/note_test_override.html b/site/note_test_override.html
deleted file mode 100644
index 5a7c517cfd0c12adae011516be34f2919332eae6..0000000000000000000000000000000000000000
--- a/site/note_test_override.html
+++ /dev/null
@@ -1,3597 +0,0 @@
-
-
-
-
-
-uvnote Integration Test Report
-
-
-
-
-
-
- -
-
Generated on:
-
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 -
-
- -
-
-
-Cell: setup | 191.24s
-
-
# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-#     "accelerate>=1.10.1",
-#     "torch>=2.7.0",
-#     "kernels==0.10.0",
-#     "transformers@https://github.com/huggingface/transformers.git",
-#     "ipdb>=0.13.13",
-#     "matplotlib>=3.7.2",
-#     "numpy>=1.24.3",
-# ]
-# ///
-
-import torch
-from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
-import time
-import torch.nn as nn
-from kernels import register_kernel_mapping, Mode, LayerRepository
-import sys
-import torch.profiler
-import gc
-import logging
-
-# enable INFO-level logging so kernel selection messages are visible
-logging.basicConfig(level=logging.INFO)
-
-def reset_peak_memory_stats():
-    """Clear the CUDA cache and reset peak-memory counters (no-op without CUDA)."""
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-        torch.cuda.reset_peak_memory_stats()
-    gc.collect()
-
-def get_memory_stats():
-    """Get current and peak CUDA memory usage."""
-    if not torch.cuda.is_available():
-        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-    return {
-        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-    }
-
-def override_kernel_layer_name(cls_name: str, value) -> bool:
-    """Helper to dynamically override the kernel_layer_name in a model class."""
-    for mod in sys.modules.values():
-        if mod is None:
-            continue
-        obj = getattr(mod, cls_name, None)
-        if isinstance(obj, type) and issubclass(obj, nn.Module):
-            setattr(obj, "kernel_layer_name", value)
-            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-            return True
-    return False
-
-
-# Init the model the normal way
-model_id = "openai/gpt-oss-20b"
-tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
-quantization_config = Mxfp4Config(dequantize=True)
-
-
-from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
-
-from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
-
-replace_kernel_forward_from_hub(GptOssMLP, "Yamoe")  # direct, type-safe
-replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
-custom_mapping = {
-    "Yamoe": {
-        "cuda": {
-            Mode.INFERENCE: LayerRepository(
-                repo_id="drbh/yamoe",
-                layer_name="Yamoe",
-                revision="v0.3.0",
-            )
-        }
-    }
-}
-register_kernel_mapping(custom_mapping)
-
-
-model = GptOssForCausalLM.from_pretrained(
-    model_id,
-    dtype="bfloat16",
-    device_map="auto",
-    use_kernels=True,
-    quantization_config=quantization_config,
-).eval()
-
-messages = [
-    {"role": "system", "content": "What is Tensor Parallelism?"},
-]
-
-inputs = tokenizer.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    return_dict=True,
-    reasoning_effort="low",
-).to("cuda")
-
-max_tokens = 512
-
-with torch.inference_mode():
-    start_time = time.perf_counter()
-    generated = model.generate(
-        **inputs,
-        max_new_tokens=max_tokens,
-        do_sample=False,
-        temperature=None,
-    )
-    end_time = time.perf_counter()
-
-print(tokenizer.decode(generated[0], skip_special_tokens=False))
-print(f"Generation took {end_time - start_time:.2f} seconds")
-
- -
-
-
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
-Knowledge cutoff: 2024-06
-Current date: 2025-09-22
-
-Reasoning: low
-
-# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions
-
-What is Tensor Parallelism?
-
-<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices, so each device holds a slice of the matrix. During forward/backward passes, each device computes partial results and then they are aggregated. It's used to scale up models beyond single device memory. Also mention pipeline parallelism, data parallelism. Provide details: e.g., for a linear layer weight matrix W of shape (out_features, in_features), we can split along out_features dimension across devices. Each device computes its part of the output. Then gather. Similarly for attention QKV projections. Provide example: GPT-3 uses tensor parallelism. Provide benefits: memory savings, compute scaling. Provide challenges: communication overhead, load balancing. Provide typical frameworks: Megatron-LM, DeepSpeed, etc. Provide mention of "tensor model parallelism" vs "tensor parallelism" synonyms. Provide explanation of how it's implemented: using torch.distributed, NCCL, etc. Provide mention of "tensor parallelism" in context of "DeepSpeed ZeRO Stage 3" or "Megatron-LM" etc. Provide mention of "tensor parallelism" as part of "model parallelism" in large language models. Provide mention of "tensor parallelism" vs "data parallelism" vs "pipeline parallelism". Provide mention of "tensor parallelism" as "splitting weight matrices across GPUs" and "communication of partial results".
-
-Also mention "tensor parallelism" can be used for linear layers, self-attention, feed-forward networks. Provide example: In GPT, each layer has a linear projection for Q, K, V, and output. These can be split across GPUs. Provide mention of "tensor parallelism" as "model parallelism" that splits the model across GPUs. Provide mention of "tensor parallelism" as "splitting the weight matrices across GPUs" and "communication of partial results".
-
-Also mention "tensor parallelism" can be used for "embedding tables" as well. Provide mention of "tensor parallelism" as "embedding table partitioning".
-
-Also mention "tensor parallelism" can be used for "attention heads" splitting across GPUs.
-
-Also mention "tensor parallelism" can be used for "feed-forward networks" splitting across GPUs.
-
-Also mention "tensor parallelism" can be used for "softmax" and "layernorm" across GPUs.
-
-Also mention "tensor parallelism" can be used for
-Generation took 97.20 seconds
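-
-A minimal, single-process sketch of the column-split scheme described in the analysis above
-(the two-way split, the toy shapes, and plain CPU tensors are illustrative assumptions; a real
-implementation would place each shard on its own GPU and gather the pieces with torch.distributed):
-
-import torch
-
-torch.manual_seed(0)
-batch, d_in, d_out = 4, 8, 6
-x = torch.randn(batch, d_in)
-W = torch.randn(d_out, d_in)      # full weight of a linear layer, y = x @ W.T
-
-# Each "device" owns half of the output features of W (split along out_features).
-W0, W1 = W.chunk(2, dim=0)
-y0 = x @ W0.T                     # partial output computed by device 0
-y1 = x @ W1.T                     # partial output computed by device 1
-y = torch.cat([y0, y1], dim=-1)   # gather the shards to reassemble the full output
-
-assert torch.allclose(y, x @ W.T, atol=1e-6)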
-
-
β–Ά UV Install Logs
- -
-
Fetching 3 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:17<00:00, 5.97s/it]
-You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
-Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:08<00:00, 2.68s/it]
-Fetching 6 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 6/6 [00:00<00:00, 13.22it/s]
-/tmp/uvnote-run-he2n6v96/home/.cache/uv/environments-v2/setup-30bb029f3f83f37d/lib/python3.12/site-packages/kernels/layer.py:868: UserWarning:
-No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
-  warnings.warn(
-INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
-
-
- -

Reference kernel

-
-
-Cell: setup2 | 197.24s
-
-
# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-#     "accelerate>=1.10.1",
-#     "torch>=2.7.0",
-#     "kernels==0.10.0",
-#     "transformers@https://github.com/huggingface/transformers.git",
-#     "ipdb>=0.13.13",
-#     "matplotlib>=3.7.2",
-#     "numpy>=1.24.3",
-# ]
-# ///
-
-import torch
-from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
-import time
-import torch.nn as nn
-from kernels import register_kernel_mapping, Mode, LayerRepository
-import sys
-import torch.profiler
-import gc
-import logging
-
-# enable INFO-level logging so kernel selection messages are visible
-logging.basicConfig(level=logging.INFO)
-
-def reset_peak_memory_stats():
-    """Clear the CUDA cache and reset peak-memory counters (no-op without CUDA)."""
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-        torch.cuda.reset_peak_memory_stats()
-    gc.collect()
-
-def get_memory_stats():
-    """Get current and peak CUDA memory usage."""
-    if not torch.cuda.is_available():
-        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-    return {
-        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-    }
-
-def override_kernel_layer_name(cls_name: str, value) -> bool:
-    """Helper to dynamically override the kernel_layer_name in a model class."""
-    for mod in sys.modules.values():
-        if mod is None:
-            continue
-        obj = getattr(mod, cls_name, None)
-        if isinstance(obj, type) and issubclass(obj, nn.Module):
-            setattr(obj, "kernel_layer_name", value)
-            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-            return True
-    return False
-
-
-# Init the model the normal way
-model_id = "openai/gpt-oss-20b"
-tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
-quantization_config = Mxfp4Config(dequantize=True)
-
-
-from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
-
-from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
-
-replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
-custom_mapping = {
-    "Yamoe": {
-        "cuda": {
-            Mode.INFERENCE: LayerRepository(
-                repo_id="drbh/yamoe",
-                layer_name="Yamoe",
-                revision="v0.3.0",
-            )
-        }
-    }
-}
-register_kernel_mapping(custom_mapping)
-
-
-model = GptOssForCausalLM.from_pretrained(
-    model_id,
-    dtype="bfloat16",
-    device_map="auto",
-    use_kernels=True,
-    quantization_config=quantization_config,
-).eval()
-
-messages = [
-    {"role": "system", "content": "What is Tensor Parallelism?"},
-]
-
-inputs = tokenizer.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    return_dict=True,
-    reasoning_effort="low",
-).to("cuda")
-
-max_tokens = 512
-
-with torch.inference_mode():
-    start_time = time.perf_counter()
-    generated = model.generate(
-        **inputs,
-        max_new_tokens=max_tokens,
-        do_sample=False,
-        temperature=None,
-    )
-    end_time = time.perf_counter()
-
-print(tokenizer.decode(generated[0], skip_special_tokens=False))
-print(f"Generation took {end_time - start_time:.2f} seconds")
-
- -
-
-
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
-Knowledge cutoff: 2024-06
-Current date: 2025-09-22
-
-Reasoning: low
-
-# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions
-
-What is Tensor Parallelism?
-
-<|end|><|start|>assistant<|channel|>analysis<|message|>We need to answer: "What is Tensor Parallelism?" Provide explanation.<|end|><|start|>assistant<|channel|>final<|message|>**Tensor Parallelism** is a technique used in large‑scale deep‑learning training to split the computation of a single tensor (e.g., a weight matrix or an activation) across multiple GPUs or other accelerators. Instead of each device holding a full copy of the tensor, the tensor is partitionedβ€”usually along one dimension (rows or columns)β€”and each device processes only its slice. The partial results are then combined (e.g., summed) to produce the final output.
-
-Key points:
-
-| Feature | Description |
-|---------|-------------|
-| **Purpose** | Reduce memory footprint per device and increase throughput for very large models. |
-| **How it works** | A weight matrix \(W\) of shape \((M, N)\) is split into \(k\) parts along one axis: \(W = [W_1, W_2, …, W_k]\). Each GPU holds one part and performs its local matrix multiplication with the input. The partial outputs are then summed (or concatenated) to form the final result. |
-| **Typical use‑case** | Training transformer models with billions of parameters where a single weight matrix would not fit on one GPU. |
-| **Relation to other parallelism** | It is one of several parallelism strategies: *data parallelism* (replicating the whole model across devices), *model parallelism* (splitting the model into sub‑modules), and *pipeline parallelism* (splitting the model into stages). Tensor parallelism is a fine‑grained form of model parallelism. |
-| **Implementation** | Frameworks like Megatron‑LM, DeepSpeed, and PyTorch’s `torch.distributed` provide primitives for tensor‑parallel operations. |
-| **Benefits** | β€’ Lower per‑device memory usage. <br>β€’ Enables training of models that would otherwise be impossible on a single device. <br>β€’ Can improve compute utilization when combined with other parallelism strategies. |
-| **Challenges** | β€’ Requires careful communication (e.g., all‑reduce) to combine partial results. <br>β€’ Load balancing and communication overhead can become bottlenecks. <br>β€’ Implementation complexity increases. |
-
-In short, **tensor parallelism** distributes the *internal* tensors of a model across multiple devices, allowing each device to work on a smaller piece of the tensor and then combine the results,
-Generation took 103.82 seconds
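-
-A tiny, single-process sketch of the row-split variant summarized in the table above, where the
-partial outputs are summed (the two-way split, toy shapes, and CPU tensors are illustrative
-assumptions; in practice each shard lives on a different GPU and the sum is a torch.distributed all-reduce):
-
-import torch
-
-torch.manual_seed(0)
-batch, d_in, d_out = 4, 8, 6
-x = torch.randn(batch, d_in)
-W = torch.randn(d_out, d_in)      # full weight, y = x @ W.T
-
-# Each "device" holds half of the input features of W and the matching slice of x.
-W0, W1 = W.chunk(2, dim=1)
-x0, x1 = x.chunk(2, dim=1)
-partial0 = x0 @ W0.T              # computed on device 0
-partial1 = x1 @ W1.T              # computed on device 1
-y = partial0 + partial1           # this sum is what the all-reduce performs across devices
-
-assert torch.allclose(y, x @ W.T, atol=1e-5)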
-
-
β–Ά UV Install Logs
- -
-
Fetching 3 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:17<00:00, 5.70s/it]
-You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
-Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:08<00:00, 2.68s/it]
-Fetching 66 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 66/66 [00:00<00:00, 69.67it/s]
-/tmp/uvnote-run-0ew4aumc/home/.cache/uv/environments-v2/setup2-d4e9795d8c8c2492/lib/python3.12/site-packages/kernels/layer.py:868: UserWarning:
-No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
-  warnings.warn(
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-
-
-
-
-
-
\ No newline at end of file
diff --git a/style.css b/style.css
deleted file mode 100644
index 114adf441e9032febb46bc056b2a8bb651075f0d..0000000000000000000000000000000000000000
--- a/style.css
+++ /dev/null
@@ -1,28 +0,0 @@
-body {
-    padding: 2rem;
-    font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
-}
-
-h1 {
-    font-size: 16px;
-    margin-top: 0;
-}
-
-p {
-    color: rgb(107, 114, 128);
-    font-size: 15px;
-    margin-bottom: 10px;
-    margin-top: 5px;
-}
-
-.card {
-    max-width: 620px;
-    margin: 0 auto;
-    padding: 16px;
-    border: 1px solid lightgray;
-    border-radius: 16px;
-}
-
-.card p:last-child {
-    margin-bottom: 0;
-}