drbh committed
Commit 43ffb32 · 1 Parent(s): b975ca1

fix: cleanup test generations and update attributes

Files changed (48)
  1. .gitattributes +11 -1
  2. .venv/index.html +0 -24
  3. .venv/lib/index.html +0 -24
  4. .venv/lib/python3.11/index.html +0 -24
  5. .venv/lib/python3.11/site-packages/flask/index.html +0 -24
  6. .venv/lib/python3.11/site-packages/flask/sansio/index.html +0 -24
  7. .venv/lib/python3.11/site-packages/index.html +0 -26
  8. .venv/lib/python3.11/site-packages/markdown-3.9.dist-info/index.html +0 -24
  9. .venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses/index.html +0 -24
  10. .venv/lib/python3.11/site-packages/werkzeug/debug/index.html +0 -24
  11. .venv/lib/python3.11/site-packages/werkzeug/debug/shared/index.html +0 -24
  12. .venv/lib/python3.11/site-packages/werkzeug/index.html +0 -24
  13. artifacts/charts/benchmark_dashboard.png +0 -0
  14. artifacts/charts/latency.png +0 -0
  15. artifacts/charts/memory.png +0 -0
  16. artifacts/charts/throughput.png +0 -0
  17. artifacts/setup/benchmark_avg_tokens_per_sec.txt +0 -1
  18. artifacts/setup/benchmark_dashboard.png +0 -0
  19. artifacts/setup/benchmark_memory.txt +0 -1
  20. artifacts/setup/benchmark_times.txt +0 -5
  21. cells/charts.py +0 -140
  22. cells/forward_and_backward.py +0 -102
  23. cells/forward_only.py +0 -96
  24. cells/nv.py +0 -3
  25. cells/setup.py +0 -116
  26. cells/setup2.py +0 -115
  27. index.html +0 -24
  28. megablocks_only.html +0 -0
  29. note.html +0 -0
  30. note_test_override.html +0 -0
  31. note_test_override.md +0 -261
  32. site/artifacts/charts/benchmark_dashboard.png +0 -0
  33. site/artifacts/charts/latency.png +0 -0
  34. site/artifacts/charts/memory.png +0 -0
  35. site/artifacts/charts/throughput.png +0 -0
  36. site/artifacts/setup/benchmark_avg_tokens_per_sec.txt +0 -1
  37. site/artifacts/setup/benchmark_dashboard.png +0 -0
  38. site/artifacts/setup/benchmark_memory.txt +0 -1
  39. site/artifacts/setup/benchmark_times.txt +0 -5
  40. site/cells/charts.py +0 -140
  41. site/cells/forward_and_backward.py +0 -102
  42. site/cells/forward_only.py +0 -96
  43. site/cells/setup.py +0 -116
  44. site/cells/setup2.py +0 -115
  45. site/megablocks_only.html +0 -0
  46. site/note.html +0 -0
  47. site/note_test_override.html +0 -0
  48. style.css +0 -28
.gitattributes CHANGED
@@ -33,4 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
- moe_benchmarks/megablocks_yamoe/artifacts/visualization/moe_performance_comparison.png filter=lfs diff=lfs merge=lfs -text
+ # Image files
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.bmp filter=lfs diff=lfs merge=lfs -text
+ *.tiff filter=lfs diff=lfs merge=lfs -text
+ *.tif filter=lfs diff=lfs merge=lfs -text
+ *.webp filter=lfs diff=lfs merge=lfs -text
+ *.svg filter=lfs diff=lfs merge=lfs -text
+ *.ico filter=lfs diff=lfs merge=lfs -text
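With these rules in place, every matching image extension is routed through Git LFS via the `filter=lfs diff=lfs merge=lfs -text` attributes. A minimal sketch for double-checking which filter a path resolves to, assuming a local clone with `git` on PATH (the paths below are illustrative only):

```python
import subprocess

# Illustrative paths only; substitute any files tracked in this repository.
paths = ["artifacts/charts/latency.png", "note_test_override.md"]

# `git check-attr filter -- <path>` prints the filter attribute each path
# resolves to, e.g. "artifacts/charts/latency.png: filter: lfs" once the
# *.png rule applies, and "filter: unspecified" for non-LFS files.
result = subprocess.run(
    ["git", "check-attr", "filter", "--", *paths],
    capture_output=True,
    text=True,
    check=True,
)
print(result.stdout)
```
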
.venv/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='lib/index.html' class='dir'>lib/</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='python3.11/index.html' class='dir'>python3.11/</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/python3.11/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='site-packages/index.html' class='dir'>site-packages/</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/python3.11/site-packages/flask/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11/site-packages/flask</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='sansio/index.html' class='dir'>sansio/</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/python3.11/site-packages/flask/sansio/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11/site-packages/flask/sansio</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='README.html' class='file'>README.html</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/python3.11/site-packages/index.html DELETED
@@ -1,26 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11/site-packages</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='flask/index.html' class='dir'>flask/</a></li>
22
- <li><a href='markdown-3.9.dist-info/index.html' class='dir'>markdown-3.9.dist-info/</a></li>
23
- <li><a href='werkzeug/index.html' class='dir'>werkzeug/</a></li>
24
- </ul>
25
- </body>
26
- </html>
 
.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11/site-packages/markdown-3.9.dist-info</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='licenses/index.html' class='dir'>licenses/</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='LICENSE.html' class='file'>LICENSE.html</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/python3.11/site-packages/werkzeug/debug/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11/site-packages/werkzeug/debug</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='shared/index.html' class='dir'>shared/</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/python3.11/site-packages/werkzeug/debug/shared/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11/site-packages/werkzeug/debug/shared</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='ICON_LICENSE.html' class='file'>ICON_LICENSE.html</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/python3.11/site-packages/werkzeug/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11/site-packages/werkzeug</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='debug/index.html' class='dir'>debug/</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
artifacts/charts/benchmark_dashboard.png DELETED
Binary file (87.7 kB)
 
artifacts/charts/latency.png DELETED
Binary file (31.6 kB)
 
artifacts/charts/memory.png DELETED
Binary file (46.3 kB)
 
artifacts/charts/throughput.png DELETED
Binary file (37.4 kB)
 
artifacts/setup/benchmark_avg_tokens_per_sec.txt DELETED
@@ -1 +0,0 @@
- 5.301658854167735
 
artifacts/setup/benchmark_dashboard.png DELETED
Binary file (92.9 kB)
 
artifacts/setup/benchmark_memory.txt DELETED
@@ -1 +0,0 @@
- 9.398672896,9.414898176,10.334765056
 
artifacts/setup/benchmark_times.txt DELETED
@@ -1,5 +0,0 @@
- 12.075035744113848
- 12.0710428240709
- 12.070115809096023
- 12.070908240042627
- 12.071364195086062
 
cells/charts.py DELETED
@@ -1,140 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "matplotlib",
4
- # "numpy",
5
- # ]
6
- # ///
7
-
8
- import matplotlib.pyplot as plt
9
- import numpy as np
10
- import os
11
-
12
- # get the path from the UVNOTE_INPUT_SETUP env var
13
- setup_path = os.getenv("UVNOTE_INPUT_SETUP", ".")
14
- print(f"Reading benchmark data from: {setup_path}")
15
-
16
- num_runs = 5
17
- max_tokens = 64
18
- times = []
19
- with open(os.path.join(setup_path, "benchmark_times.txt"), "r") as f:
20
- for line in f:
21
- times.append(float(line.strip()))
22
-
23
-
24
- avg_time = 0.0
25
- min_time = 0.0
26
- max_time = 0.0
27
- final_mem = {"allocated_gb": 0.0, "peak_gb": 0.0, "reserved_gb": 0.0}
28
-
29
- avg_tokens_per_sec = 0.0
30
- with open(os.path.join(setup_path, "benchmark_avg_tokens_per_sec.txt"), "r") as f:
31
- avg_tokens_per_sec = float(f.read().strip())
32
-
33
- times_file = os.path.join(setup_path, "benchmark_times.txt")
34
- memory_file = os.path.join(setup_path, "benchmark_memory.txt")
35
-
36
-
37
- # Minimal brutalist palette (dark theme): grayscale + 1 accent
38
- ACCENT = '#5ec8f8' # calm cyan-blue accent
39
- FG = '#e6e6e6' # light gray text/lines
40
- MUTED = '#9aa0a6' # muted gray for secondary
41
- GRID = '#333333' # grid lines
42
-
43
- # Styling tuned for clarity, high contrast, few colors
44
- plt.style.use('dark_background')
45
- plt.rcParams['figure.facecolor'] = 'none'
46
- plt.rcParams['axes.facecolor'] = 'none'
47
- plt.rcParams['savefig.facecolor'] = 'none'
48
- plt.rcParams['savefig.transparent'] = True
49
- plt.rcParams['font.family'] = 'monospace'
50
- plt.rcParams['font.weight'] = 'bold'
51
- plt.rcParams['axes.linewidth'] = 3
52
- plt.rcParams['grid.linewidth'] = 2
53
- plt.rcParams['lines.linewidth'] = 3
54
- plt.rcParams['patch.linewidth'] = 2
55
-
56
- # Prepare data
57
- runs = list(range(1, len(times) + 1))
58
- tokens_per_sec_all = [max_tokens / t for t in times]
59
-
60
- # Chart 1: Throughput Performance
61
- fig1, ax1 = plt.subplots(1, 1, figsize=(12, 6))
62
- fig1.patch.set_alpha(0)
63
- ax1.patch.set_alpha(0)
64
-
65
- ax1.plot(runs, tokens_per_sec_all, color=ACCENT, marker='o', markersize=12,
66
- markerfacecolor=ACCENT, markeredgecolor=FG, markeredgewidth=3, linewidth=5, label='tok/s')
67
- ax1.fill_between(runs, 0, tokens_per_sec_all, alpha=0.2, color=ACCENT)
68
- ax1.axhline(y=avg_tokens_per_sec, color=FG, linestyle='--', linewidth=3,
69
- label=f'AVG: {avg_tokens_per_sec:.1f}')
70
- ax1.set_title('THROUGHPUT PERFORMANCE', color=FG, fontsize=18, pad=20, fontweight='bold')
71
- ax1.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
72
- ax1.set_ylabel('TOKENS/SEC', color=FG, fontsize=14, fontweight='bold')
73
- ax1.grid(True, color=GRID, alpha=0.5, linewidth=2)
74
- ax1.tick_params(colors=FG, labelsize=12)
75
- legend1 = ax1.legend(frameon=False, loc='lower right')
76
- for text in legend1.get_texts():
77
- text.set_color(FG)
78
- text.set_fontweight('bold')
79
- plt.tight_layout()
80
- plt.savefig('throughput.png', dpi=150, bbox_inches='tight', transparent=True)
81
- plt.show()
82
-
83
- # Chart 2: Generation Latency
84
- fig2, ax2 = plt.subplots(1, 1, figsize=(12, 6))
85
- fig2.patch.set_alpha(0)
86
- ax2.patch.set_alpha(0)
87
-
88
- bar_colors = [ACCENT if i % 2 == 0 else MUTED for i in range(len(times))]
89
- bars = ax2.bar(runs, times, color=bar_colors, edgecolor=FG, linewidth=3, width=0.6)
90
- ax2.axhline(y=avg_time, color=FG, linestyle='--', linewidth=3,
91
- label=f'AVG: {avg_time:.2f}s')
92
- for i, (run, time, bar) in enumerate(zip(runs, times, bars)):
93
- ax2.text(run, time + 0.02, f'{time:.2f}s', ha='center', va='bottom',
94
- color=FG, fontweight='bold', fontsize=11)
95
- ax2.set_title('GENERATION LATENCY', color=FG, fontsize=18, pad=20, fontweight='bold')
96
- ax2.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
97
- ax2.set_ylabel('TIME (SECONDS)', color=FG, fontsize=14, fontweight='bold')
98
- ax2.grid(True, axis='y', color=GRID, alpha=0.5, linewidth=2)
99
- ax2.tick_params(colors=FG, labelsize=12)
100
- ax2.set_ylim(0, max(times) * 1.15)
101
- legend2 = ax2.legend(frameon=False, loc='upper right')
102
- for text in legend2.get_texts():
103
- text.set_color(FG)
104
- text.set_fontweight('bold')
105
- plt.tight_layout()
106
- plt.savefig('latency.png', dpi=150, bbox_inches='tight', transparent=True)
107
- plt.show()
108
-
109
- # Chart 3: Memory Usage
110
- fig3, ax3 = plt.subplots(1, 1, figsize=(12, 6))
111
- fig3.patch.set_alpha(0)
112
- ax3.patch.set_alpha(0)
113
-
114
- memory_labels = ['ALLOCATED', 'PEAK', 'RESERVED']
115
- memory_values = [final_mem['allocated_gb'], final_mem['peak_gb'], final_mem['reserved_gb']]
116
- colors_mem = [MUTED, ACCENT, FG]
117
- bars = ax3.barh(memory_labels, memory_values, color=colors_mem, edgecolor=FG, linewidth=3, height=0.5)
118
- for i, (label, value, bar) in enumerate(zip(memory_labels, memory_values, bars)):
119
- ax3.text(value + 0.5, i, f'{value:.1f} GB', va='center',
120
- color=FG, fontweight='bold', fontsize=13)
121
- ax3.set_title('MEMORY USAGE', color=FG, fontsize=18, pad=20, fontweight='bold')
122
- ax3.set_xlabel('GIGABYTES', color=FG, fontsize=14, fontweight='bold')
123
- ax3.set_xlim(0, max(memory_values) * 1.3)
124
- ax3.grid(True, axis='x', color=GRID, alpha=0.5, linewidth=2)
125
- ax3.tick_params(colors=FG, labelsize=12)
126
- ax3.set_yticks(range(len(memory_labels)))
127
- ax3.set_yticklabels(memory_labels, fontweight='bold')
128
- plt.tight_layout()
129
- plt.savefig('memory.png', dpi=150, bbox_inches='tight', transparent=True)
130
- plt.show()
131
-
132
- print(f"\n📊 Charts saved as:")
133
- print(f" • throughput.png")
134
- print(f" • latency.png")
135
- print(f" • memory.png")
136
- print(f"\nBenchmark Summary:")
137
- print(f" avg tokens/sec: {avg_tokens_per_sec:.1f}")
138
- print(f" min time: {min_time:.3f}s")
139
- print(f" max time: {max_time:.3f}s")
140
- print(f" peak memory: {final_mem['peak_gb']:.2f}GB")
 
cells/forward_and_backward.py DELETED
@@ -1,102 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.12"
3
- # dependencies = [
4
- # "accelerate>=1.10.1",
5
- # "torch>=2.7.0",
6
- # "kernels==0.10.0",
7
- # "transformers@https://github.com/huggingface/transformers.git",
8
- # "ipdb>=0.13.13",
9
- # "matplotlib>=3.7.2",
10
- # "numpy>=1.24.3",
11
- # ]
12
- # ///
13
-
14
- import torch
15
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
- import time
17
- import torch.nn as nn
18
- from kernels import register_kernel_mapping, Mode, LayerRepository
19
- import sys
20
- import torch.profiler
21
- import gc
22
- import logging
23
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
24
-
25
- # remove liger kernel for testing
26
- replace_kernel_forward_from_hub(GptOssRMSNorm, None)
27
-
28
- # set to debug logging
29
- logging.basicConfig(level=logging.INFO)
30
-
31
- def reset_peak_memory_stats():
32
- """Clear CUDA cache and reset memory allocation counters."""
33
- torch.cuda.empty_cache()
34
- if torch.cuda.is_available():
35
- torch.cuda.reset_peak_memory_stats()
36
- gc.collect()
37
-
38
- def get_memory_stats():
39
- """Get current and peak CUDA memory usage."""
40
- if not torch.cuda.is_available():
41
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
42
- return {
43
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
44
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
45
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
46
- }
47
-
48
- def override_kernel_layer_name(cls_name: str, value) -> bool:
49
- """Helper to dynamically override the kernel_layer_name in a model class."""
50
- for mod in sys.modules.values():
51
- if mod is None:
52
- continue
53
- obj = getattr(mod, cls_name, None)
54
- if isinstance(obj, type) and issubclass(obj, nn.Module):
55
- setattr(obj, "kernel_layer_name", value)
56
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
57
- return True
58
- return False
59
-
60
-
61
- # Init the model the normal way
62
- model_id = "openai/gpt-oss-20b"
63
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
64
- quantization_config = Mxfp4Config(dequantize=True)
65
-
66
- model = GptOssForCausalLM.from_pretrained(
67
- model_id,
68
- dtype="bfloat16",
69
- device_map="auto",
70
- use_kernels=True,
71
- quantization_config=quantization_config,
72
- training=True,
73
- ).eval()
74
-
75
- messages = [
76
- {"role": "system", "content": "What is Tensor Parallelism?"},
77
- ]
78
-
79
- inputs = tokenizer.apply_chat_template(
80
- messages,
81
- add_generation_prompt=True,
82
- return_tensors="pt",
83
- return_dict=True,
84
- reasoning_effort="low",
85
- ).to("cuda")
86
-
87
- max_tokens = 512
88
-
89
-
90
- # forward and backward pass
91
- with torch.autograd.set_grad_enabled(True):
92
- start_time = time.perf_counter()
93
- generated = model.generate(
94
- **inputs,
95
- max_new_tokens=max_tokens,
96
- do_sample=False,
97
- temperature=None,
98
- )
99
- end_time = time.perf_counter()
100
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
101
- print(f"Generation took {end_time - start_time:.2f} seconds")
102
-
 
cells/forward_only.py DELETED
@@ -1,96 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.12"
3
- # dependencies = [
4
- # "accelerate>=1.10.1",
5
- # "torch>=2.7.0",
6
- # "kernels==0.10.0",
7
- # "transformers@https://github.com/huggingface/transformers.git",
8
- # "ipdb>=0.13.13",
9
- # "matplotlib>=3.7.2",
10
- # "numpy>=1.24.3",
11
- # ]
12
- # ///
13
-
14
- import torch
15
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
- import time
17
- import torch.nn as nn
18
- from kernels import register_kernel_mapping, Mode, LayerRepository
19
- import sys
20
- import torch.profiler
21
- import gc
22
- import logging
23
-
24
- # set to debug logging
25
- logging.basicConfig(level=logging.INFO)
26
-
27
- def reset_peak_memory_stats():
28
- """Clear CUDA cache and reset memory allocation counters."""
29
- torch.cuda.empty_cache()
30
- if torch.cuda.is_available():
31
- torch.cuda.reset_peak_memory_stats()
32
- gc.collect()
33
-
34
- def get_memory_stats():
35
- """Get current and peak CUDA memory usage."""
36
- if not torch.cuda.is_available():
37
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
38
- return {
39
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
40
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
41
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
42
- }
43
-
44
- def override_kernel_layer_name(cls_name: str, value) -> bool:
45
- """Helper to dynamically override the kernel_layer_name in a model class."""
46
- for mod in sys.modules.values():
47
- if mod is None:
48
- continue
49
- obj = getattr(mod, cls_name, None)
50
- if isinstance(obj, type) and issubclass(obj, nn.Module):
51
- setattr(obj, "kernel_layer_name", value)
52
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
53
- return True
54
- return False
55
-
56
-
57
- # Init the model the normal way
58
- model_id = "openai/gpt-oss-20b"
59
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
60
- quantization_config = Mxfp4Config(dequantize=True)
61
-
62
-
63
- model = GptOssForCausalLM.from_pretrained(
64
- model_id,
65
- dtype="bfloat16",
66
- device_map="auto",
67
- use_kernels=True,
68
- quantization_config=quantization_config,
69
- ).eval()
70
-
71
- messages = [
72
- {"role": "system", "content": "What is Tensor Parallelism?"},
73
- ]
74
-
75
- inputs = tokenizer.apply_chat_template(
76
- messages,
77
- add_generation_prompt=True,
78
- return_tensors="pt",
79
- return_dict=True,
80
- reasoning_effort="low",
81
- ).to("cuda")
82
-
83
- max_tokens = 512
84
-
85
- with torch.inference_mode():
86
- start_time = time.perf_counter()
87
- generated = model.generate(
88
- **inputs,
89
- max_new_tokens=max_tokens,
90
- do_sample=False,
91
- temperature=None,
92
- )
93
- end_time = time.perf_counter()
94
-
95
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
96
- print(f"Generation took {end_time - start_time:.2f} seconds")
 
cells/nv.py DELETED
@@ -1,3 +0,0 @@
- import subprocess
-
- print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
 
cells/setup.py DELETED
@@ -1,116 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.12"
3
- # dependencies = [
4
- # "accelerate>=1.10.1",
5
- # "torch>=2.7.0",
6
- # "kernels==0.10.0",
7
- # "transformers@https://github.com/huggingface/transformers.git",
8
- # "ipdb>=0.13.13",
9
- # "matplotlib>=3.7.2",
10
- # "numpy>=1.24.3",
11
- # ]
12
- # ///
13
-
14
- import torch
15
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
- import time
17
- import torch.nn as nn
18
- from kernels import register_kernel_mapping, Mode, LayerRepository
19
- import sys
20
- import torch.profiler
21
- import gc
22
- import logging
23
-
24
- # set to debug logging
25
- logging.basicConfig(level=logging.INFO)
26
-
27
- def reset_peak_memory_stats():
28
- """Clear CUDA cache and reset memory allocation counters."""
29
- torch.cuda.empty_cache()
30
- if torch.cuda.is_available():
31
- torch.cuda.reset_peak_memory_stats()
32
- gc.collect()
33
-
34
- def get_memory_stats():
35
- """Get current and peak CUDA memory usage."""
36
- if not torch.cuda.is_available():
37
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
38
- return {
39
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
40
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
41
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
42
- }
43
-
44
- def override_kernel_layer_name(cls_name: str, value) -> bool:
45
- """Helper to dynamically override the kernel_layer_name in a model class."""
46
- for mod in sys.modules.values():
47
- if mod is None:
48
- continue
49
- obj = getattr(mod, cls_name, None)
50
- if isinstance(obj, type) and issubclass(obj, nn.Module):
51
- setattr(obj, "kernel_layer_name", value)
52
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
53
- return True
54
- return False
55
-
56
-
57
- # Init the model the normal way
58
- model_id = "openai/gpt-oss-20b"
59
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
60
- quantization_config = Mxfp4Config(dequantize=True)
61
-
62
-
63
- from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
64
-
65
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
66
-
67
- replace_kernel_forward_from_hub(GptOssMLP, "Yamoe") # direct, type-safe
68
- replace_kernel_forward_from_hub(GptOssRMSNorm, None) # direct, type-safe
69
- custom_mapping = {
70
- "Yamoe": {
71
- "cuda": {
72
- Mode.INFERENCE: LayerRepository(
73
- repo_id="drbh/yamoe",
74
- layer_name="Yamoe",
75
- revision="v0.3.0",
76
- )
77
- }
78
- }
79
- }
80
- register_kernel_mapping(custom_mapping)
81
-
82
-
83
- model = GptOssForCausalLM.from_pretrained(
84
- model_id,
85
- dtype="bfloat16",
86
- device_map="auto",
87
- use_kernels=True,
88
- quantization_config=quantization_config,
89
- ).eval()
90
-
91
- messages = [
92
- {"role": "system", "content": "What is Tensor Parallelism?"},
93
- ]
94
-
95
- inputs = tokenizer.apply_chat_template(
96
- messages,
97
- add_generation_prompt=True,
98
- return_tensors="pt",
99
- return_dict=True,
100
- reasoning_effort="low",
101
- ).to("cuda")
102
-
103
- max_tokens = 512
104
-
105
- with torch.inference_mode():
106
- start_time = time.perf_counter()
107
- generated = model.generate(
108
- **inputs,
109
- max_new_tokens=max_tokens,
110
- do_sample=False,
111
- temperature=None,
112
- )
113
- end_time = time.perf_counter()
114
-
115
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
116
- print(f"Generation took {end_time - start_time:.2f} seconds")
 
cells/setup2.py DELETED
@@ -1,115 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.12"
3
- # dependencies = [
4
- # "accelerate>=1.10.1",
5
- # "torch>=2.7.0",
6
- # "kernels==0.10.0",
7
- # "transformers@https://github.com/huggingface/transformers.git",
8
- # "ipdb>=0.13.13",
9
- # "matplotlib>=3.7.2",
10
- # "numpy>=1.24.3",
11
- # ]
12
- # ///
13
-
14
- import torch
15
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
- import time
17
- import torch.nn as nn
18
- from kernels import register_kernel_mapping, Mode, LayerRepository
19
- import sys
20
- import torch.profiler
21
- import gc
22
- import logging
23
-
24
- # set to debug logging
25
- logging.basicConfig(level=logging.INFO)
26
-
27
- def reset_peak_memory_stats():
28
- """Clear CUDA cache and reset memory allocation counters."""
29
- torch.cuda.empty_cache()
30
- if torch.cuda.is_available():
31
- torch.cuda.reset_peak_memory_stats()
32
- gc.collect()
33
-
34
- def get_memory_stats():
35
- """Get current and peak CUDA memory usage."""
36
- if not torch.cuda.is_available():
37
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
38
- return {
39
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
40
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
41
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
42
- }
43
-
44
- def override_kernel_layer_name(cls_name: str, value) -> bool:
45
- """Helper to dynamically override the kernel_layer_name in a model class."""
46
- for mod in sys.modules.values():
47
- if mod is None:
48
- continue
49
- obj = getattr(mod, cls_name, None)
50
- if isinstance(obj, type) and issubclass(obj, nn.Module):
51
- setattr(obj, "kernel_layer_name", value)
52
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
53
- return True
54
- return False
55
-
56
-
57
- # Init the model the normal way
58
- model_id = "openai/gpt-oss-20b"
59
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
60
- quantization_config = Mxfp4Config(dequantize=True)
61
-
62
-
63
- from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
64
-
65
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
66
-
67
- replace_kernel_forward_from_hub(GptOssRMSNorm, None) # direct, type-safe
68
- custom_mapping = {
69
- "Yamoe": {
70
- "cuda": {
71
- Mode.INFERENCE: LayerRepository(
72
- repo_id="drbh/yamoe",
73
- layer_name="Yamoe",
74
- revision="v0.3.0",
75
- )
76
- }
77
- }
78
- }
79
- register_kernel_mapping(custom_mapping)
80
-
81
-
82
- model = GptOssForCausalLM.from_pretrained(
83
- model_id,
84
- dtype="bfloat16",
85
- device_map="auto",
86
- use_kernels=True,
87
- quantization_config=quantization_config,
88
- ).eval()
89
-
90
- messages = [
91
- {"role": "system", "content": "What is Tensor Parallelism?"},
92
- ]
93
-
94
- inputs = tokenizer.apply_chat_template(
95
- messages,
96
- add_generation_prompt=True,
97
- return_tensors="pt",
98
- return_dict=True,
99
- reasoning_effort="low",
100
- ).to("cuda")
101
-
102
- max_tokens = 512
103
-
104
- with torch.inference_mode():
105
- start_time = time.perf_counter()
106
- generated = model.generate(
107
- **inputs,
108
- max_new_tokens=max_tokens,
109
- do_sample=False,
110
- temperature=None,
111
- )
112
- end_time = time.perf_counter()
113
-
114
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
115
- print(f"Generation took {end_time - start_time:.2f} seconds")
 
index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /</h1>
19
- <ul>
20
- <li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
21
- <li><a href='moe_benchmarks/index.html' class='dir'>moe_benchmarks/</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
megablocks_only.html DELETED
The diff for this file is too large to render. See raw diff
 
note.html DELETED
The diff for this file is too large to render. See raw diff
 
note_test_override.html DELETED
The diff for this file is too large to render. See raw diff
 
note_test_override.md DELETED
@@ -1,261 +0,0 @@
1
- ---
2
- title: "uvnote Integration Test Report"
3
- author: "uvnote"
4
- theme: "light"
5
- syntax_theme: "monokai"
6
- show_line_numbers: true
7
- collapse_code: false
8
- custom_css: |
9
- #output-setup {
10
- overflow-x: auto;
11
- }
12
- .cell-stdout {
13
- width: 100%;
14
- }
15
- .cell-stderr {
16
- width: max-content;
17
- max-height: 300px;
18
- overflow: auto;
19
- }
20
- ---
21
-
22
- ```python id=setup
23
- # /// script
24
- # requires-python = ">=3.12"
25
- # dependencies = [
26
- # "accelerate>=1.10.1",
27
- # "torch>=2.7.0",
28
- # "kernels==0.10.0",
29
- # "transformers@https://github.com/huggingface/transformers.git",
30
- # "ipdb>=0.13.13",
31
- # "matplotlib>=3.7.2",
32
- # "numpy>=1.24.3",
33
- # ]
34
- # ///
35
-
36
- import torch
37
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
38
- import time
39
- import torch.nn as nn
40
- from kernels import register_kernel_mapping, Mode, LayerRepository
41
- import sys
42
- import torch.profiler
43
- import gc
44
- import logging
45
-
46
- # set to debug logging
47
- logging.basicConfig(level=logging.INFO)
48
-
49
- def reset_peak_memory_stats():
50
- """Clear CUDA cache and reset memory allocation counters."""
51
- torch.cuda.empty_cache()
52
- if torch.cuda.is_available():
53
- torch.cuda.reset_peak_memory_stats()
54
- gc.collect()
55
-
56
- def get_memory_stats():
57
- """Get current and peak CUDA memory usage."""
58
- if not torch.cuda.is_available():
59
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
60
- return {
61
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
62
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
63
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
64
- }
65
-
66
- def override_kernel_layer_name(cls_name: str, value) -> bool:
67
- """Helper to dynamically override the kernel_layer_name in a model class."""
68
- for mod in sys.modules.values():
69
- if mod is None:
70
- continue
71
- obj = getattr(mod, cls_name, None)
72
- if isinstance(obj, type) and issubclass(obj, nn.Module):
73
- setattr(obj, "kernel_layer_name", value)
74
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
75
- return True
76
- return False
77
-
78
-
79
- # Init the model the normal way
80
- model_id = "openai/gpt-oss-20b"
81
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
82
- quantization_config = Mxfp4Config(dequantize=True)
83
-
84
-
85
- from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
86
-
87
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
88
-
89
- replace_kernel_forward_from_hub(GptOssMLP, "Yamoe") # direct, type-safe
90
- replace_kernel_forward_from_hub(GptOssRMSNorm, None) # direct, type-safe
91
- custom_mapping = {
92
- "Yamoe": {
93
- "cuda": {
94
- Mode.INFERENCE: LayerRepository(
95
- repo_id="drbh/yamoe",
96
- layer_name="Yamoe",
97
- revision="v0.3.0",
98
- )
99
- }
100
- }
101
- }
102
- register_kernel_mapping(custom_mapping)
103
-
104
-
105
- model = GptOssForCausalLM.from_pretrained(
106
- model_id,
107
- dtype="bfloat16",
108
- device_map="auto",
109
- use_kernels=True,
110
- quantization_config=quantization_config,
111
- ).eval()
112
-
113
- messages = [
114
- {"role": "system", "content": "What is Tensor Parallelism?"},
115
- ]
116
-
117
- inputs = tokenizer.apply_chat_template(
118
- messages,
119
- add_generation_prompt=True,
120
- return_tensors="pt",
121
- return_dict=True,
122
- reasoning_effort="low",
123
- ).to("cuda")
124
-
125
- max_tokens = 512
126
-
127
- with torch.inference_mode():
128
- start_time = time.perf_counter()
129
- generated = model.generate(
130
- **inputs,
131
- max_new_tokens=max_tokens,
132
- do_sample=False,
133
- temperature=None,
134
- )
135
- end_time = time.perf_counter()
136
-
137
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
138
- print(f"Generation took {end_time - start_time:.2f} seconds")
139
-
140
- ```
141
-
142
- # Reference kernel
143
-
144
- ```python id=setup2
145
- # /// script
146
- # requires-python = ">=3.12"
147
- # dependencies = [
148
- # "accelerate>=1.10.1",
149
- # "torch>=2.7.0",
150
- # "kernels==0.10.0",
151
- # "transformers@https://github.com/huggingface/transformers.git",
152
- # "ipdb>=0.13.13",
153
- # "matplotlib>=3.7.2",
154
- # "numpy>=1.24.3",
155
- # ]
156
- # ///
157
-
158
- import torch
159
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
160
- import time
161
- import torch.nn as nn
162
- from kernels import register_kernel_mapping, Mode, LayerRepository
163
- import sys
164
- import torch.profiler
165
- import gc
166
- import logging
167
-
168
- # set to debug logging
169
- logging.basicConfig(level=logging.INFO)
170
-
171
- def reset_peak_memory_stats():
172
- """Clear CUDA cache and reset memory allocation counters."""
173
- torch.cuda.empty_cache()
174
- if torch.cuda.is_available():
175
- torch.cuda.reset_peak_memory_stats()
176
- gc.collect()
177
-
178
- def get_memory_stats():
179
- """Get current and peak CUDA memory usage."""
180
- if not torch.cuda.is_available():
181
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
182
- return {
183
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
184
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
185
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
186
- }
187
-
188
- def override_kernel_layer_name(cls_name: str, value) -> bool:
189
- """Helper to dynamically override the kernel_layer_name in a model class."""
190
- for mod in sys.modules.values():
191
- if mod is None:
192
- continue
193
- obj = getattr(mod, cls_name, None)
194
- if isinstance(obj, type) and issubclass(obj, nn.Module):
195
- setattr(obj, "kernel_layer_name", value)
196
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
197
- return True
198
- return False
199
-
200
-
201
- # Init the model the normal way
202
- model_id = "openai/gpt-oss-20b"
203
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
204
- quantization_config = Mxfp4Config(dequantize=True)
205
-
206
-
207
- from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
208
-
209
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
210
-
211
- replace_kernel_forward_from_hub(GptOssRMSNorm, None) # direct, type-safe
212
- custom_mapping = {
213
- "Yamoe": {
214
- "cuda": {
215
- Mode.INFERENCE: LayerRepository(
216
- repo_id="drbh/yamoe",
217
- layer_name="Yamoe",
218
- revision="v0.3.0",
219
- )
220
- }
221
- }
222
- }
223
- register_kernel_mapping(custom_mapping)
224
-
225
-
226
- model = GptOssForCausalLM.from_pretrained(
227
- model_id,
228
- dtype="bfloat16",
229
- device_map="auto",
230
- use_kernels=True,
231
- quantization_config=quantization_config,
232
- ).eval()
233
-
234
- messages = [
235
- {"role": "system", "content": "What is Tensor Parallelism?"},
236
- ]
237
-
238
- inputs = tokenizer.apply_chat_template(
239
- messages,
240
- add_generation_prompt=True,
241
- return_tensors="pt",
242
- return_dict=True,
243
- reasoning_effort="low",
244
- ).to("cuda")
245
-
246
- max_tokens = 512
247
-
248
- with torch.inference_mode():
249
- start_time = time.perf_counter()
250
- generated = model.generate(
251
- **inputs,
252
- max_new_tokens=max_tokens,
253
- do_sample=False,
254
- temperature=None,
255
- )
256
- end_time = time.perf_counter()
257
-
258
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
259
- print(f"Generation took {end_time - start_time:.2f} seconds")
260
-
261
- ```
 
site/artifacts/charts/benchmark_dashboard.png DELETED
Binary file (87.7 kB)
 
site/artifacts/charts/latency.png DELETED
Binary file (31.6 kB)
 
site/artifacts/charts/memory.png DELETED
Binary file (46.3 kB)
 
site/artifacts/charts/throughput.png DELETED
Binary file (37.4 kB)
 
site/artifacts/setup/benchmark_avg_tokens_per_sec.txt DELETED
@@ -1 +0,0 @@
- 5.301658854167735
 
site/artifacts/setup/benchmark_dashboard.png DELETED
Binary file (92.9 kB)
 
site/artifacts/setup/benchmark_memory.txt DELETED
@@ -1 +0,0 @@
- 9.398672896,9.414898176,10.334765056
 
site/artifacts/setup/benchmark_times.txt DELETED
@@ -1,5 +0,0 @@
- 12.075035744113848
- 12.0710428240709
- 12.070115809096023
- 12.070908240042627
- 12.071364195086062
 
site/cells/charts.py DELETED
@@ -1,140 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "matplotlib",
4
- # "numpy",
5
- # ]
6
- # ///
7
-
8
- import matplotlib.pyplot as plt
9
- import numpy as np
10
- import os
11
-
12
- # get the path from the UVNOTE_INPUT_SETUP env var
13
- setup_path = os.getenv("UVNOTE_INPUT_SETUP", ".")
14
- print(f"Reading benchmark data from: {setup_path}")
15
-
16
- num_runs = 5
17
- max_tokens = 64
18
- times = []
19
- with open(os.path.join(setup_path, "benchmark_times.txt"), "r") as f:
20
- for line in f:
21
- times.append(float(line.strip()))
22
-
23
-
24
- avg_time = 0.0
25
- min_time = 0.0
26
- max_time = 0.0
27
- final_mem = {"allocated_gb": 0.0, "peak_gb": 0.0, "reserved_gb": 0.0}
28
-
29
- avg_tokens_per_sec = 0.0
30
- with open(os.path.join(setup_path, "benchmark_avg_tokens_per_sec.txt"), "r") as f:
31
- avg_tokens_per_sec = float(f.read().strip())
32
-
33
- times_file = os.path.join(setup_path, "benchmark_times.txt")
34
- memory_file = os.path.join(setup_path, "benchmark_memory.txt")
35
-
36
-
37
- # Minimal brutalist palette (dark theme): grayscale + 1 accent
38
- ACCENT = '#5ec8f8' # calm cyan-blue accent
39
- FG = '#e6e6e6' # light gray text/lines
40
- MUTED = '#9aa0a6' # muted gray for secondary
41
- GRID = '#333333' # grid lines
42
-
43
- # Styling tuned for clarity, high contrast, few colors
44
- plt.style.use('dark_background')
45
- plt.rcParams['figure.facecolor'] = 'none'
46
- plt.rcParams['axes.facecolor'] = 'none'
47
- plt.rcParams['savefig.facecolor'] = 'none'
48
- plt.rcParams['savefig.transparent'] = True
49
- plt.rcParams['font.family'] = 'monospace'
50
- plt.rcParams['font.weight'] = 'bold'
51
- plt.rcParams['axes.linewidth'] = 3
52
- plt.rcParams['grid.linewidth'] = 2
53
- plt.rcParams['lines.linewidth'] = 3
54
- plt.rcParams['patch.linewidth'] = 2
55
-
56
- # Prepare data
57
- runs = list(range(1, len(times) + 1))
58
- tokens_per_sec_all = [max_tokens / t for t in times]
59
-
60
- # Chart 1: Throughput Performance
61
- fig1, ax1 = plt.subplots(1, 1, figsize=(12, 6))
62
- fig1.patch.set_alpha(0)
63
- ax1.patch.set_alpha(0)
64
-
65
- ax1.plot(runs, tokens_per_sec_all, color=ACCENT, marker='o', markersize=12,
66
- markerfacecolor=ACCENT, markeredgecolor=FG, markeredgewidth=3, linewidth=5, label='tok/s')
67
- ax1.fill_between(runs, 0, tokens_per_sec_all, alpha=0.2, color=ACCENT)
68
- ax1.axhline(y=avg_tokens_per_sec, color=FG, linestyle='--', linewidth=3,
69
- label=f'AVG: {avg_tokens_per_sec:.1f}')
70
- ax1.set_title('THROUGHPUT PERFORMANCE', color=FG, fontsize=18, pad=20, fontweight='bold')
71
- ax1.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
72
- ax1.set_ylabel('TOKENS/SEC', color=FG, fontsize=14, fontweight='bold')
73
- ax1.grid(True, color=GRID, alpha=0.5, linewidth=2)
74
- ax1.tick_params(colors=FG, labelsize=12)
75
- legend1 = ax1.legend(frameon=False, loc='lower right')
76
- for text in legend1.get_texts():
77
- text.set_color(FG)
78
- text.set_fontweight('bold')
79
- plt.tight_layout()
80
- plt.savefig('throughput.png', dpi=150, bbox_inches='tight', transparent=True)
81
- plt.show()
82
-
83
- # Chart 2: Generation Latency
84
- fig2, ax2 = plt.subplots(1, 1, figsize=(12, 6))
85
- fig2.patch.set_alpha(0)
86
- ax2.patch.set_alpha(0)
87
-
88
- bar_colors = [ACCENT if i % 2 == 0 else MUTED for i in range(len(times))]
89
- bars = ax2.bar(runs, times, color=bar_colors, edgecolor=FG, linewidth=3, width=0.6)
90
- ax2.axhline(y=avg_time, color=FG, linestyle='--', linewidth=3,
91
- label=f'AVG: {avg_time:.2f}s')
92
- for i, (run, time, bar) in enumerate(zip(runs, times, bars)):
93
- ax2.text(run, time + 0.02, f'{time:.2f}s', ha='center', va='bottom',
94
- color=FG, fontweight='bold', fontsize=11)
95
- ax2.set_title('GENERATION LATENCY', color=FG, fontsize=18, pad=20, fontweight='bold')
96
- ax2.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
97
- ax2.set_ylabel('TIME (SECONDS)', color=FG, fontsize=14, fontweight='bold')
98
- ax2.grid(True, axis='y', color=GRID, alpha=0.5, linewidth=2)
99
- ax2.tick_params(colors=FG, labelsize=12)
100
- ax2.set_ylim(0, max(times) * 1.15)
101
- legend2 = ax2.legend(frameon=False, loc='upper right')
102
- for text in legend2.get_texts():
103
- text.set_color(FG)
104
- text.set_fontweight('bold')
105
- plt.tight_layout()
106
- plt.savefig('latency.png', dpi=150, bbox_inches='tight', transparent=True)
107
- plt.show()
108
-
109
- # Chart 3: Memory Usage
110
- fig3, ax3 = plt.subplots(1, 1, figsize=(12, 6))
111
- fig3.patch.set_alpha(0)
112
- ax3.patch.set_alpha(0)
113
-
114
- memory_labels = ['ALLOCATED', 'PEAK', 'RESERVED']
115
- memory_values = [final_mem['allocated_gb'], final_mem['peak_gb'], final_mem['reserved_gb']]
116
- colors_mem = [MUTED, ACCENT, FG]
117
- bars = ax3.barh(memory_labels, memory_values, color=colors_mem, edgecolor=FG, linewidth=3, height=0.5)
118
- for i, (label, value, bar) in enumerate(zip(memory_labels, memory_values, bars)):
119
- ax3.text(value + 0.5, i, f'{value:.1f} GB', va='center',
120
- color=FG, fontweight='bold', fontsize=13)
121
- ax3.set_title('MEMORY USAGE', color=FG, fontsize=18, pad=20, fontweight='bold')
122
- ax3.set_xlabel('GIGABYTES', color=FG, fontsize=14, fontweight='bold')
123
- ax3.set_xlim(0, max(memory_values) * 1.3)
124
- ax3.grid(True, axis='x', color=GRID, alpha=0.5, linewidth=2)
125
- ax3.tick_params(colors=FG, labelsize=12)
126
- ax3.set_yticks(range(len(memory_labels)))
127
- ax3.set_yticklabels(memory_labels, fontweight='bold')
128
- plt.tight_layout()
129
- plt.savefig('memory.png', dpi=150, bbox_inches='tight', transparent=True)
130
- plt.show()
131
-
132
- print(f"\n📊 Charts saved as:")
133
- print(f" • throughput.png")
134
- print(f" • latency.png")
135
- print(f" • memory.png")
136
- print(f"\nBenchmark Summary:")
137
- print(f" avg tokens/sec: {avg_tokens_per_sec:.1f}")
138
- print(f" min time: {min_time:.3f}s")
139
- print(f" max time: {max_time:.3f}s")
140
- print(f" peak memory: {final_mem['peak_gb']:.2f}GB")
 
site/cells/forward_and_backward.py DELETED
@@ -1,102 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.12"
3
- # dependencies = [
4
- # "accelerate>=1.10.1",
5
- # "torch>=2.7.0",
6
- # "kernels==0.10.0",
7
- # "transformers@https://github.com/huggingface/transformers.git",
8
- # "ipdb>=0.13.13",
9
- # "matplotlib>=3.7.2",
10
- # "numpy>=1.24.3",
11
- # ]
12
- # ///
13
-
14
- import torch
15
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
- import time
17
- import torch.nn as nn
18
- from kernels import register_kernel_mapping, Mode, LayerRepository
19
- import sys
20
- import torch.profiler
21
- import gc
22
- import logging
23
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
24
-
25
- # remove liger kernel for testing
26
- replace_kernel_forward_from_hub(GptOssRMSNorm, None)
27
-
28
- # set to debug logging
29
- logging.basicConfig(level=logging.INFO)
30
-
31
- def reset_peak_memory_stats():
32
- """Clear CUDA cache and reset memory allocation counters."""
33
- torch.cuda.empty_cache()
34
- if torch.cuda.is_available():
35
- torch.cuda.reset_peak_memory_stats()
36
- gc.collect()
37
-
38
- def get_memory_stats():
39
- """Get current and peak CUDA memory usage."""
40
- if not torch.cuda.is_available():
41
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
42
- return {
43
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
44
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
45
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
46
- }
47
-
48
- def override_kernel_layer_name(cls_name: str, value) -> bool:
49
- """Helper to dynamically override the kernel_layer_name in a model class."""
50
- for mod in sys.modules.values():
51
- if mod is None:
52
- continue
53
- obj = getattr(mod, cls_name, None)
54
- if isinstance(obj, type) and issubclass(obj, nn.Module):
55
- setattr(obj, "kernel_layer_name", value)
56
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
57
- return True
58
- return False
59
-
60
-
61
- # Init the model the normal way
62
- model_id = "openai/gpt-oss-20b"
63
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
64
- quantization_config = Mxfp4Config(dequantize=True)
65
-
66
- model = GptOssForCausalLM.from_pretrained(
67
- model_id,
68
- dtype="bfloat16",
69
- device_map="auto",
70
- use_kernels=True,
71
- quantization_config=quantization_config,
72
- training=True,
73
- ).eval()
74
-
75
- messages = [
76
- {"role": "system", "content": "What is Tensor Parallelism?"},
77
- ]
78
-
79
- inputs = tokenizer.apply_chat_template(
80
- messages,
81
- add_generation_prompt=True,
82
- return_tensors="pt",
83
- return_dict=True,
84
- reasoning_effort="low",
85
- ).to("cuda")
86
-
87
- max_tokens = 512
88
-
89
-
90
- # forward and backward pass
91
- with torch.autograd.set_grad_enabled(True):
92
- start_time = time.perf_counter()
93
- generated = model.generate(
94
- **inputs,
95
- max_new_tokens=max_tokens,
96
- do_sample=False,
97
- temperature=None,
98
- )
99
- end_time = time.perf_counter()
100
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
101
- print(f"Generation took {end_time - start_time:.2f} seconds")
102
-
 
site/cells/forward_only.py DELETED
@@ -1,96 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.12"
3
- # dependencies = [
4
- # "accelerate>=1.10.1",
5
- # "torch>=2.7.0",
6
- # "kernels==0.10.0",
7
- # "transformers@https://github.com/huggingface/transformers.git",
8
- # "ipdb>=0.13.13",
9
- # "matplotlib>=3.7.2",
10
- # "numpy>=1.24.3",
11
- # ]
12
- # ///
13
-
14
- import torch
15
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
- import time
17
- import torch.nn as nn
18
- from kernels import register_kernel_mapping, Mode, LayerRepository
19
- import sys
20
- import torch.profiler
21
- import gc
22
- import logging
23
-
24
- # set to debug logging
25
- logging.basicConfig(level=logging.INFO)
26
-
27
- def reset_peak_memory_stats():
28
- """Clear CUDA cache and reset memory allocation counters."""
29
- torch.cuda.empty_cache()
30
- if torch.cuda.is_available():
31
- torch.cuda.reset_peak_memory_stats()
32
- gc.collect()
33
-
34
- def get_memory_stats():
35
- """Get current and peak CUDA memory usage."""
36
- if not torch.cuda.is_available():
37
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
38
- return {
39
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
40
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
41
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
42
- }
43
-
44
- def override_kernel_layer_name(cls_name: str, value) -> bool:
45
- """Helper to dynamically override the kernel_layer_name in a model class."""
46
- for mod in sys.modules.values():
47
- if mod is None:
48
- continue
49
- obj = getattr(mod, cls_name, None)
50
- if isinstance(obj, type) and issubclass(obj, nn.Module):
51
- setattr(obj, "kernel_layer_name", value)
52
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
53
- return True
54
- return False
55
-
56
-
57
- # Init the model the normal way
58
- model_id = "openai/gpt-oss-20b"
59
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
60
- quantization_config = Mxfp4Config(dequantize=True)
61
-
62
-
63
- model = GptOssForCausalLM.from_pretrained(
64
- model_id,
65
- dtype="bfloat16",
66
- device_map="auto",
67
- use_kernels=True,
68
- quantization_config=quantization_config,
69
- ).eval()
70
-
71
- messages = [
72
- {"role": "system", "content": "What is Tensor Parallelism?"},
73
- ]
74
-
75
- inputs = tokenizer.apply_chat_template(
76
- messages,
77
- add_generation_prompt=True,
78
- return_tensors="pt",
79
- return_dict=True,
80
- reasoning_effort="low",
81
- ).to("cuda")
82
-
83
- max_tokens = 512
84
-
85
- with torch.inference_mode():
86
- start_time = time.perf_counter()
87
- generated = model.generate(
88
- **inputs,
89
- max_new_tokens=max_tokens,
90
- do_sample=False,
91
- temperature=None,
92
- )
93
- end_time = time.perf_counter()
94
-
95
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
96
- print(f"Generation took {end_time - start_time:.2f} seconds")
 
site/cells/setup.py DELETED
@@ -1,116 +0,0 @@
- # /// script
- # requires-python = ">=3.12"
- # dependencies = [
- #     "accelerate>=1.10.1",
- #     "torch>=2.7.0",
- #     "kernels==0.10.0",
- #     "transformers@https://github.com/huggingface/transformers.git",
- #     "ipdb>=0.13.13",
- #     "matplotlib>=3.7.2",
- #     "numpy>=1.24.3",
- # ]
- # ///
-
- import torch
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
- import time
- import torch.nn as nn
- from kernels import register_kernel_mapping, Mode, LayerRepository
- import sys
- import torch.profiler
- import gc
- import logging
-
- # set to debug logging
- logging.basicConfig(level=logging.INFO)
-
- def reset_peak_memory_stats():
-     """Clear CUDA cache and reset memory allocation counters."""
-     torch.cuda.empty_cache()
-     if torch.cuda.is_available():
-         torch.cuda.reset_peak_memory_stats()
-     gc.collect()
-
- def get_memory_stats():
-     """Get current and peak CUDA memory usage."""
-     if not torch.cuda.is_available():
-         return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-     return {
-         "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-         "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-         "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-     }
-
- def override_kernel_layer_name(cls_name: str, value) -> bool:
-     """Helper to dynamically override the kernel_layer_name in a model class."""
-     for mod in sys.modules.values():
-         if mod is None:
-             continue
-         obj = getattr(mod, cls_name, None)
-         if isinstance(obj, type) and issubclass(obj, nn.Module):
-             setattr(obj, "kernel_layer_name", value)
-             print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-             return True
-     return False
-
-
- # Init the model the normal way
- model_id = "openai/gpt-oss-20b"
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
- quantization_config = Mxfp4Config(dequantize=True)
-
-
- from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
-
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
-
- replace_kernel_forward_from_hub(GptOssMLP, "Yamoe")  # direct, type-safe
- replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
- custom_mapping = {
-     "Yamoe": {
-         "cuda": {
-             Mode.INFERENCE: LayerRepository(
-                 repo_id="drbh/yamoe",
-                 layer_name="Yamoe",
-                 revision="v0.3.0",
-             )
-         }
-     }
- }
- register_kernel_mapping(custom_mapping)
-
-
- model = GptOssForCausalLM.from_pretrained(
-     model_id,
-     dtype="bfloat16",
-     device_map="auto",
-     use_kernels=True,
-     quantization_config=quantization_config,
- ).eval()
-
- messages = [
-     {"role": "system", "content": "What is Tensor Parallelism?"},
- ]
-
- inputs = tokenizer.apply_chat_template(
-     messages,
-     add_generation_prompt=True,
-     return_tensors="pt",
-     return_dict=True,
-     reasoning_effort="low",
- ).to("cuda")
-
- max_tokens = 512
-
- with torch.inference_mode():
-     start_time = time.perf_counter()
-     generated = model.generate(
-         **inputs,
-         max_new_tokens=max_tokens,
-         do_sample=False,
-         temperature=None,
-     )
-     end_time = time.perf_counter()
-
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
- print(f"Generation took {end_time - start_time:.2f} seconds")
 
site/cells/setup2.py DELETED
@@ -1,115 +0,0 @@
- # /// script
- # requires-python = ">=3.12"
- # dependencies = [
- #     "accelerate>=1.10.1",
- #     "torch>=2.7.0",
- #     "kernels==0.10.0",
- #     "transformers@https://github.com/huggingface/transformers.git",
- #     "ipdb>=0.13.13",
- #     "matplotlib>=3.7.2",
- #     "numpy>=1.24.3",
- # ]
- # ///
-
- import torch
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
- import time
- import torch.nn as nn
- from kernels import register_kernel_mapping, Mode, LayerRepository
- import sys
- import torch.profiler
- import gc
- import logging
-
- # set to debug logging
- logging.basicConfig(level=logging.INFO)
-
- def reset_peak_memory_stats():
-     """Clear CUDA cache and reset memory allocation counters."""
-     torch.cuda.empty_cache()
-     if torch.cuda.is_available():
-         torch.cuda.reset_peak_memory_stats()
-     gc.collect()
-
- def get_memory_stats():
-     """Get current and peak CUDA memory usage."""
-     if not torch.cuda.is_available():
-         return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-     return {
-         "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-         "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-         "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-     }
-
- def override_kernel_layer_name(cls_name: str, value) -> bool:
-     """Helper to dynamically override the kernel_layer_name in a model class."""
-     for mod in sys.modules.values():
-         if mod is None:
-             continue
-         obj = getattr(mod, cls_name, None)
-         if isinstance(obj, type) and issubclass(obj, nn.Module):
-             setattr(obj, "kernel_layer_name", value)
-             print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-             return True
-     return False
-
-
- # Init the model the normal way
- model_id = "openai/gpt-oss-20b"
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
- quantization_config = Mxfp4Config(dequantize=True)
-
-
- from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
-
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
-
- replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
- custom_mapping = {
-     "Yamoe": {
-         "cuda": {
-             Mode.INFERENCE: LayerRepository(
-                 repo_id="drbh/yamoe",
-                 layer_name="Yamoe",
-                 revision="v0.3.0",
-             )
-         }
-     }
- }
- register_kernel_mapping(custom_mapping)
-
-
- model = GptOssForCausalLM.from_pretrained(
-     model_id,
-     dtype="bfloat16",
-     device_map="auto",
-     use_kernels=True,
-     quantization_config=quantization_config,
- ).eval()
-
- messages = [
-     {"role": "system", "content": "What is Tensor Parallelism?"},
- ]
-
- inputs = tokenizer.apply_chat_template(
-     messages,
-     add_generation_prompt=True,
-     return_tensors="pt",
-     return_dict=True,
-     reasoning_effort="low",
- ).to("cuda")
-
- max_tokens = 512
-
- with torch.inference_mode():
-     start_time = time.perf_counter()
-     generated = model.generate(
-         **inputs,
-         max_new_tokens=max_tokens,
-         do_sample=False,
-         temperature=None,
-     )
-     end_time = time.perf_counter()
-
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
- print(f"Generation took {end_time - start_time:.2f} seconds")
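Note: the two deleted setup scripts differ only in whether the MLP forward is swapped directly. setup.py calls replace_kernel_forward_from_hub(GptOssMLP, "Yamoe") before registering the mapping, while setup2.py registers the mapping alone and relies on use_kernels=True to resolve it. Below is a minimal, illustrative sketch of just that kernel-mapping step; the repo (drbh/yamoe), layer name (Yamoe), and revision (v0.3.0) are copied from the deleted files, and the snippet is not itself part of this commit.

# Illustrative sketch (not part of this commit): register the Yamoe CUDA
# inference kernel for GptOssMLP, mirroring the deleted setup.py.
from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP

# Point GptOssMLP at the "Yamoe" kernel layer (direct, type-safe override).
replace_kernel_forward_from_hub(GptOssMLP, "Yamoe")

# Map the "Yamoe" name to a concrete Hub kernel repository for CUDA inference.
register_kernel_mapping({
    "Yamoe": {
        "cuda": {
            Mode.INFERENCE: LayerRepository(
                repo_id="drbh/yamoe",
                layer_name="Yamoe",
                revision="v0.3.0",
            )
        }
    }
})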
 
site/megablocks_only.html DELETED
The diff for this file is too large to render.
 
site/note.html DELETED
The diff for this file is too large to render.
 
site/note_test_override.html DELETED
The diff for this file is too large to render.
 
style.css DELETED
@@ -1,28 +0,0 @@
- body {
-     padding: 2rem;
-     font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
- }
-
- h1 {
-     font-size: 16px;
-     margin-top: 0;
- }
-
- p {
-     color: rgb(107, 114, 128);
-     font-size: 15px;
-     margin-bottom: 10px;
-     margin-top: 5px;
- }
-
- .card {
-     max-width: 620px;
-     margin: 0 auto;
-     padding: 16px;
-     border: 1px solid lightgray;
-     border-radius: 16px;
- }
-
- .card p:last-child {
-     margin-bottom: 0;
- }