drbh committed
Commit 43ffb32 · 1 Parent(s): b975ca1

fix: cleanup test generations and update attributes

Files changed (48)
  1. .gitattributes +11 -1
  2. .venv/index.html +0 -24
  3. .venv/lib/index.html +0 -24
  4. .venv/lib/python3.11/index.html +0 -24
  5. .venv/lib/python3.11/site-packages/flask/index.html +0 -24
  6. .venv/lib/python3.11/site-packages/flask/sansio/index.html +0 -24
  7. .venv/lib/python3.11/site-packages/index.html +0 -26
  8. .venv/lib/python3.11/site-packages/markdown-3.9.dist-info/index.html +0 -24
  9. .venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses/index.html +0 -24
  10. .venv/lib/python3.11/site-packages/werkzeug/debug/index.html +0 -24
  11. .venv/lib/python3.11/site-packages/werkzeug/debug/shared/index.html +0 -24
  12. .venv/lib/python3.11/site-packages/werkzeug/index.html +0 -24
  13. artifacts/charts/benchmark_dashboard.png +0 -0
  14. artifacts/charts/latency.png +0 -0
  15. artifacts/charts/memory.png +0 -0
  16. artifacts/charts/throughput.png +0 -0
  17. artifacts/setup/benchmark_avg_tokens_per_sec.txt +0 -1
  18. artifacts/setup/benchmark_dashboard.png +0 -0
  19. artifacts/setup/benchmark_memory.txt +0 -1
  20. artifacts/setup/benchmark_times.txt +0 -5
  21. cells/charts.py +0 -140
  22. cells/forward_and_backward.py +0 -102
  23. cells/forward_only.py +0 -96
  24. cells/nv.py +0 -3
  25. cells/setup.py +0 -116
  26. cells/setup2.py +0 -115
  27. index.html +0 -24
  28. megablocks_only.html +0 -0
  29. note.html +0 -0
  30. note_test_override.html +0 -0
  31. note_test_override.md +0 -261
  32. site/artifacts/charts/benchmark_dashboard.png +0 -0
  33. site/artifacts/charts/latency.png +0 -0
  34. site/artifacts/charts/memory.png +0 -0
  35. site/artifacts/charts/throughput.png +0 -0
  36. site/artifacts/setup/benchmark_avg_tokens_per_sec.txt +0 -1
  37. site/artifacts/setup/benchmark_dashboard.png +0 -0
  38. site/artifacts/setup/benchmark_memory.txt +0 -1
  39. site/artifacts/setup/benchmark_times.txt +0 -5
  40. site/cells/charts.py +0 -140
  41. site/cells/forward_and_backward.py +0 -102
  42. site/cells/forward_only.py +0 -96
  43. site/cells/setup.py +0 -116
  44. site/cells/setup2.py +0 -115
  45. site/megablocks_only.html +0 -0
  46. site/note.html +0 -0
  47. site/note_test_override.html +0 -0
  48. style.css +0 -28
.gitattributes CHANGED
@@ -33,4 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
- moe_benchmarks/megablocks_yamoe/artifacts/visualization/moe_performance_comparison.png filter=lfs diff=lfs merge=lfs -text
+ # Image files
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.bmp filter=lfs diff=lfs merge=lfs -text
+ *.tiff filter=lfs diff=lfs merge=lfs -text
+ *.tif filter=lfs diff=lfs merge=lfs -text
+ *.webp filter=lfs diff=lfs merge=lfs -text
+ *.svg filter=lfs diff=lfs merge=lfs -text
+ *.ico filter=lfs diff=lfs merge=lfs -text
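With these rules in place, every matching image extension is routed through Git LFS via the `filter=lfs diff=lfs merge=lfs -text` attributes. A minimal sketch for double-checking which filter a path resolves to, assuming a local clone with `git` on PATH (the paths below are illustrative only):

```python
import subprocess

# Illustrative paths only; substitute any files tracked in this repository.
paths = ["artifacts/charts/latency.png", "note_test_override.md"]

# `git check-attr filter -- <path>` prints the filter attribute each path
# resolves to, e.g. "artifacts/charts/latency.png: filter: lfs" once the
# *.png rule applies, and "filter: unspecified" for non-LFS files.
result = subprocess.run(
    ["git", "check-attr", "filter", "--", *paths],
    capture_output=True,
    text=True,
    check=True,
)
print(result.stdout)
```
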
.venv/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='lib/index.html' class='dir'>lib/</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='python3.11/index.html' class='dir'>python3.11/</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/python3.11/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='site-packages/index.html' class='dir'>site-packages/</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/python3.11/site-packages/flask/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11/site-packages/flask</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='sansio/index.html' class='dir'>sansio/</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/python3.11/site-packages/flask/sansio/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11/site-packages/flask/sansio</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='README.html' class='file'>README.html</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/python3.11/site-packages/index.html DELETED
@@ -1,26 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11/site-packages</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='flask/index.html' class='dir'>flask/</a></li>
22
- <li><a href='markdown-3.9.dist-info/index.html' class='dir'>markdown-3.9.dist-info/</a></li>
23
- <li><a href='werkzeug/index.html' class='dir'>werkzeug/</a></li>
24
- </ul>
25
- </body>
26
- </html>
 
.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11/site-packages/markdown-3.9.dist-info</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='licenses/index.html' class='dir'>licenses/</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='LICENSE.html' class='file'>LICENSE.html</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/python3.11/site-packages/werkzeug/debug/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11/site-packages/werkzeug/debug</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='shared/index.html' class='dir'>shared/</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/python3.11/site-packages/werkzeug/debug/shared/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11/site-packages/werkzeug/debug/shared</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='ICON_LICENSE.html' class='file'>ICON_LICENSE.html</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
.venv/lib/python3.11/site-packages/werkzeug/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /.venv/lib/python3.11/site-packages/werkzeug</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='debug/index.html' class='dir'>debug/</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
artifacts/charts/benchmark_dashboard.png DELETED
Binary file (87.7 kB)
 
artifacts/charts/latency.png DELETED
Binary file (31.6 kB)
 
artifacts/charts/memory.png DELETED
Binary file (46.3 kB)
 
artifacts/charts/throughput.png DELETED
Binary file (37.4 kB)
 
artifacts/setup/benchmark_avg_tokens_per_sec.txt DELETED
@@ -1 +0,0 @@
- 5.301658854167735
 
artifacts/setup/benchmark_dashboard.png DELETED
Binary file (92.9 kB)
 
artifacts/setup/benchmark_memory.txt DELETED
@@ -1 +0,0 @@
- 9.398672896,9.414898176,10.334765056
 
artifacts/setup/benchmark_times.txt DELETED
@@ -1,5 +0,0 @@
- 12.075035744113848
- 12.0710428240709
- 12.070115809096023
- 12.070908240042627
- 12.071364195086062
 
cells/charts.py DELETED
@@ -1,140 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "matplotlib",
4
- # "numpy",
5
- # ]
6
- # ///
7
-
8
- import matplotlib.pyplot as plt
9
- import numpy as np
10
- import os
11
-
12
- # get the path from the UVNOTE_INPUT_SETUP env var
13
- setup_path = os.getenv("UVNOTE_INPUT_SETUP", ".")
14
- print(f"Reading benchmark data from: {setup_path}")
15
-
16
- num_runs = 5
17
- max_tokens = 64
18
- times = []
19
- with open(os.path.join(setup_path, "benchmark_times.txt"), "r") as f:
20
- for line in f:
21
- times.append(float(line.strip()))
22
-
23
-
24
- avg_time = 0.0
25
- min_time = 0.0
26
- max_time = 0.0
27
- final_mem = {"allocated_gb": 0.0, "peak_gb": 0.0, "reserved_gb": 0.0}
28
-
29
- avg_tokens_per_sec = 0.0
30
- with open(os.path.join(setup_path, "benchmark_avg_tokens_per_sec.txt"), "r") as f:
31
- avg_tokens_per_sec = float(f.read().strip())
32
-
33
- times_file = os.path.join(setup_path, "benchmark_times.txt")
34
- memory_file = os.path.join(setup_path, "benchmark_memory.txt")
35
-
36
-
37
- # Minimal brutalist palette (dark theme): grayscale + 1 accent
38
- ACCENT = '#5ec8f8' # calm cyan-blue accent
39
- FG = '#e6e6e6' # light gray text/lines
40
- MUTED = '#9aa0a6' # muted gray for secondary
41
- GRID = '#333333' # grid lines
42
-
43
- # Styling tuned for clarity, high contrast, few colors
44
- plt.style.use('dark_background')
45
- plt.rcParams['figure.facecolor'] = 'none'
46
- plt.rcParams['axes.facecolor'] = 'none'
47
- plt.rcParams['savefig.facecolor'] = 'none'
48
- plt.rcParams['savefig.transparent'] = True
49
- plt.rcParams['font.family'] = 'monospace'
50
- plt.rcParams['font.weight'] = 'bold'
51
- plt.rcParams['axes.linewidth'] = 3
52
- plt.rcParams['grid.linewidth'] = 2
53
- plt.rcParams['lines.linewidth'] = 3
54
- plt.rcParams['patch.linewidth'] = 2
55
-
56
- # Prepare data
57
- runs = list(range(1, len(times) + 1))
58
- tokens_per_sec_all = [max_tokens / t for t in times]
59
-
60
- # Chart 1: Throughput Performance
61
- fig1, ax1 = plt.subplots(1, 1, figsize=(12, 6))
62
- fig1.patch.set_alpha(0)
63
- ax1.patch.set_alpha(0)
64
-
65
- ax1.plot(runs, tokens_per_sec_all, color=ACCENT, marker='o', markersize=12,
66
- markerfacecolor=ACCENT, markeredgecolor=FG, markeredgewidth=3, linewidth=5, label='tok/s')
67
- ax1.fill_between(runs, 0, tokens_per_sec_all, alpha=0.2, color=ACCENT)
68
- ax1.axhline(y=avg_tokens_per_sec, color=FG, linestyle='--', linewidth=3,
69
- label=f'AVG: {avg_tokens_per_sec:.1f}')
70
- ax1.set_title('THROUGHPUT PERFORMANCE', color=FG, fontsize=18, pad=20, fontweight='bold')
71
- ax1.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
72
- ax1.set_ylabel('TOKENS/SEC', color=FG, fontsize=14, fontweight='bold')
73
- ax1.grid(True, color=GRID, alpha=0.5, linewidth=2)
74
- ax1.tick_params(colors=FG, labelsize=12)
75
- legend1 = ax1.legend(frameon=False, loc='lower right')
76
- for text in legend1.get_texts():
77
- text.set_color(FG)
78
- text.set_fontweight('bold')
79
- plt.tight_layout()
80
- plt.savefig('throughput.png', dpi=150, bbox_inches='tight', transparent=True)
81
- plt.show()
82
-
83
- # Chart 2: Generation Latency
84
- fig2, ax2 = plt.subplots(1, 1, figsize=(12, 6))
85
- fig2.patch.set_alpha(0)
86
- ax2.patch.set_alpha(0)
87
-
88
- bar_colors = [ACCENT if i % 2 == 0 else MUTED for i in range(len(times))]
89
- bars = ax2.bar(runs, times, color=bar_colors, edgecolor=FG, linewidth=3, width=0.6)
90
- ax2.axhline(y=avg_time, color=FG, linestyle='--', linewidth=3,
91
- label=f'AVG: {avg_time:.2f}s')
92
- for i, (run, time, bar) in enumerate(zip(runs, times, bars)):
93
- ax2.text(run, time + 0.02, f'{time:.2f}s', ha='center', va='bottom',
94
- color=FG, fontweight='bold', fontsize=11)
95
- ax2.set_title('GENERATION LATENCY', color=FG, fontsize=18, pad=20, fontweight='bold')
96
- ax2.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
97
- ax2.set_ylabel('TIME (SECONDS)', color=FG, fontsize=14, fontweight='bold')
98
- ax2.grid(True, axis='y', color=GRID, alpha=0.5, linewidth=2)
99
- ax2.tick_params(colors=FG, labelsize=12)
100
- ax2.set_ylim(0, max(times) * 1.15)
101
- legend2 = ax2.legend(frameon=False, loc='upper right')
102
- for text in legend2.get_texts():
103
- text.set_color(FG)
104
- text.set_fontweight('bold')
105
- plt.tight_layout()
106
- plt.savefig('latency.png', dpi=150, bbox_inches='tight', transparent=True)
107
- plt.show()
108
-
109
- # Chart 3: Memory Usage
110
- fig3, ax3 = plt.subplots(1, 1, figsize=(12, 6))
111
- fig3.patch.set_alpha(0)
112
- ax3.patch.set_alpha(0)
113
-
114
- memory_labels = ['ALLOCATED', 'PEAK', 'RESERVED']
115
- memory_values = [final_mem['allocated_gb'], final_mem['peak_gb'], final_mem['reserved_gb']]
116
- colors_mem = [MUTED, ACCENT, FG]
117
- bars = ax3.barh(memory_labels, memory_values, color=colors_mem, edgecolor=FG, linewidth=3, height=0.5)
118
- for i, (label, value, bar) in enumerate(zip(memory_labels, memory_values, bars)):
119
- ax3.text(value + 0.5, i, f'{value:.1f} GB', va='center',
120
- color=FG, fontweight='bold', fontsize=13)
121
- ax3.set_title('MEMORY USAGE', color=FG, fontsize=18, pad=20, fontweight='bold')
122
- ax3.set_xlabel('GIGABYTES', color=FG, fontsize=14, fontweight='bold')
123
- ax3.set_xlim(0, max(memory_values) * 1.3)
124
- ax3.grid(True, axis='x', color=GRID, alpha=0.5, linewidth=2)
125
- ax3.tick_params(colors=FG, labelsize=12)
126
- ax3.set_yticks(range(len(memory_labels)))
127
- ax3.set_yticklabels(memory_labels, fontweight='bold')
128
- plt.tight_layout()
129
- plt.savefig('memory.png', dpi=150, bbox_inches='tight', transparent=True)
130
- plt.show()
131
-
132
- print(f"\n📊 Charts saved as:")
133
- print(f" • throughput.png")
134
- print(f" • latency.png")
135
- print(f" • memory.png")
136
- print(f"\nBenchmark Summary:")
137
- print(f" avg tokens/sec: {avg_tokens_per_sec:.1f}")
138
- print(f" min time: {min_time:.3f}s")
139
- print(f" max time: {max_time:.3f}s")
140
- print(f" peak memory: {final_mem['peak_gb']:.2f}GB")
 
cells/forward_and_backward.py DELETED
@@ -1,102 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.12"
3
- # dependencies = [
4
- # "accelerate>=1.10.1",
5
- # "torch>=2.7.0",
6
- # "kernels==0.10.0",
7
- # "transformers@https://github.com/huggingface/transformers.git",
8
- # "ipdb>=0.13.13",
9
- # "matplotlib>=3.7.2",
10
- # "numpy>=1.24.3",
11
- # ]
12
- # ///
13
-
14
- import torch
15
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
- import time
17
- import torch.nn as nn
18
- from kernels import register_kernel_mapping, Mode, LayerRepository
19
- import sys
20
- import torch.profiler
21
- import gc
22
- import logging
23
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
24
-
25
- # remove liger kernel for testing
26
- replace_kernel_forward_from_hub(GptOssRMSNorm, None)
27
-
28
- # set to debug logging
29
- logging.basicConfig(level=logging.INFO)
30
-
31
- def reset_peak_memory_stats():
32
- """Clear CUDA cache and reset memory allocation counters."""
33
- torch.cuda.empty_cache()
34
- if torch.cuda.is_available():
35
- torch.cuda.reset_peak_memory_stats()
36
- gc.collect()
37
-
38
- def get_memory_stats():
39
- """Get current and peak CUDA memory usage."""
40
- if not torch.cuda.is_available():
41
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
42
- return {
43
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
44
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
45
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
46
- }
47
-
48
- def override_kernel_layer_name(cls_name: str, value) -> bool:
49
- """Helper to dynamically override the kernel_layer_name in a model class."""
50
- for mod in sys.modules.values():
51
- if mod is None:
52
- continue
53
- obj = getattr(mod, cls_name, None)
54
- if isinstance(obj, type) and issubclass(obj, nn.Module):
55
- setattr(obj, "kernel_layer_name", value)
56
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
57
- return True
58
- return False
59
-
60
-
61
- # Init the model the normal way
62
- model_id = "openai/gpt-oss-20b"
63
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
64
- quantization_config = Mxfp4Config(dequantize=True)
65
-
66
- model = GptOssForCausalLM.from_pretrained(
67
- model_id,
68
- dtype="bfloat16",
69
- device_map="auto",
70
- use_kernels=True,
71
- quantization_config=quantization_config,
72
- training=True,
73
- ).eval()
74
-
75
- messages = [
76
- {"role": "system", "content": "What is Tensor Parallelism?"},
77
- ]
78
-
79
- inputs = tokenizer.apply_chat_template(
80
- messages,
81
- add_generation_prompt=True,
82
- return_tensors="pt",
83
- return_dict=True,
84
- reasoning_effort="low",
85
- ).to("cuda")
86
-
87
- max_tokens = 512
88
-
89
-
90
- # forward and backward pass
91
- with torch.autograd.set_grad_enabled(True):
92
- start_time = time.perf_counter()
93
- generated = model.generate(
94
- **inputs,
95
- max_new_tokens=max_tokens,
96
- do_sample=False,
97
- temperature=None,
98
- )
99
- end_time = time.perf_counter()
100
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
101
- print(f"Generation took {end_time - start_time:.2f} seconds")
102
-
 
cells/forward_only.py DELETED
@@ -1,96 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.12"
3
- # dependencies = [
4
- # "accelerate>=1.10.1",
5
- # "torch>=2.7.0",
6
- # "kernels==0.10.0",
7
- # "transformers@https://github.com/huggingface/transformers.git",
8
- # "ipdb>=0.13.13",
9
- # "matplotlib>=3.7.2",
10
- # "numpy>=1.24.3",
11
- # ]
12
- # ///
13
-
14
- import torch
15
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
- import time
17
- import torch.nn as nn
18
- from kernels import register_kernel_mapping, Mode, LayerRepository
19
- import sys
20
- import torch.profiler
21
- import gc
22
- import logging
23
-
24
- # set to debug logging
25
- logging.basicConfig(level=logging.INFO)
26
-
27
- def reset_peak_memory_stats():
28
- """Clear CUDA cache and reset memory allocation counters."""
29
- torch.cuda.empty_cache()
30
- if torch.cuda.is_available():
31
- torch.cuda.reset_peak_memory_stats()
32
- gc.collect()
33
-
34
- def get_memory_stats():
35
- """Get current and peak CUDA memory usage."""
36
- if not torch.cuda.is_available():
37
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
38
- return {
39
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
40
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
41
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
42
- }
43
-
44
- def override_kernel_layer_name(cls_name: str, value) -> bool:
45
- """Helper to dynamically override the kernel_layer_name in a model class."""
46
- for mod in sys.modules.values():
47
- if mod is None:
48
- continue
49
- obj = getattr(mod, cls_name, None)
50
- if isinstance(obj, type) and issubclass(obj, nn.Module):
51
- setattr(obj, "kernel_layer_name", value)
52
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
53
- return True
54
- return False
55
-
56
-
57
- # Init the model the normal way
58
- model_id = "openai/gpt-oss-20b"
59
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
60
- quantization_config = Mxfp4Config(dequantize=True)
61
-
62
-
63
- model = GptOssForCausalLM.from_pretrained(
64
- model_id,
65
- dtype="bfloat16",
66
- device_map="auto",
67
- use_kernels=True,
68
- quantization_config=quantization_config,
69
- ).eval()
70
-
71
- messages = [
72
- {"role": "system", "content": "What is Tensor Parallelism?"},
73
- ]
74
-
75
- inputs = tokenizer.apply_chat_template(
76
- messages,
77
- add_generation_prompt=True,
78
- return_tensors="pt",
79
- return_dict=True,
80
- reasoning_effort="low",
81
- ).to("cuda")
82
-
83
- max_tokens = 512
84
-
85
- with torch.inference_mode():
86
- start_time = time.perf_counter()
87
- generated = model.generate(
88
- **inputs,
89
- max_new_tokens=max_tokens,
90
- do_sample=False,
91
- temperature=None,
92
- )
93
- end_time = time.perf_counter()
94
-
95
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
96
- print(f"Generation took {end_time - start_time:.2f} seconds")
 
cells/nv.py DELETED
@@ -1,3 +0,0 @@
- import subprocess
-
- print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
 
cells/setup.py DELETED
@@ -1,116 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.12"
3
- # dependencies = [
4
- # "accelerate>=1.10.1",
5
- # "torch>=2.7.0",
6
- # "kernels==0.10.0",
7
- # "transformers@https://github.com/huggingface/transformers.git",
8
- # "ipdb>=0.13.13",
9
- # "matplotlib>=3.7.2",
10
- # "numpy>=1.24.3",
11
- # ]
12
- # ///
13
-
14
- import torch
15
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
- import time
17
- import torch.nn as nn
18
- from kernels import register_kernel_mapping, Mode, LayerRepository
19
- import sys
20
- import torch.profiler
21
- import gc
22
- import logging
23
-
24
- # set to debug logging
25
- logging.basicConfig(level=logging.INFO)
26
-
27
- def reset_peak_memory_stats():
28
- """Clear CUDA cache and reset memory allocation counters."""
29
- torch.cuda.empty_cache()
30
- if torch.cuda.is_available():
31
- torch.cuda.reset_peak_memory_stats()
32
- gc.collect()
33
-
34
- def get_memory_stats():
35
- """Get current and peak CUDA memory usage."""
36
- if not torch.cuda.is_available():
37
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
38
- return {
39
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
40
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
41
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
42
- }
43
-
44
- def override_kernel_layer_name(cls_name: str, value) -> bool:
45
- """Helper to dynamically override the kernel_layer_name in a model class."""
46
- for mod in sys.modules.values():
47
- if mod is None:
48
- continue
49
- obj = getattr(mod, cls_name, None)
50
- if isinstance(obj, type) and issubclass(obj, nn.Module):
51
- setattr(obj, "kernel_layer_name", value)
52
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
53
- return True
54
- return False
55
-
56
-
57
- # Init the model the normal way
58
- model_id = "openai/gpt-oss-20b"
59
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
60
- quantization_config = Mxfp4Config(dequantize=True)
61
-
62
-
63
- from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
64
-
65
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
66
-
67
- replace_kernel_forward_from_hub(GptOssMLP, "Yamoe") # direct, type-safe
68
- replace_kernel_forward_from_hub(GptOssRMSNorm, None) # direct, type-safe
69
- custom_mapping = {
70
- "Yamoe": {
71
- "cuda": {
72
- Mode.INFERENCE: LayerRepository(
73
- repo_id="drbh/yamoe",
74
- layer_name="Yamoe",
75
- revision="v0.3.0",
76
- )
77
- }
78
- }
79
- }
80
- register_kernel_mapping(custom_mapping)
81
-
82
-
83
- model = GptOssForCausalLM.from_pretrained(
84
- model_id,
85
- dtype="bfloat16",
86
- device_map="auto",
87
- use_kernels=True,
88
- quantization_config=quantization_config,
89
- ).eval()
90
-
91
- messages = [
92
- {"role": "system", "content": "What is Tensor Parallelism?"},
93
- ]
94
-
95
- inputs = tokenizer.apply_chat_template(
96
- messages,
97
- add_generation_prompt=True,
98
- return_tensors="pt",
99
- return_dict=True,
100
- reasoning_effort="low",
101
- ).to("cuda")
102
-
103
- max_tokens = 512
104
-
105
- with torch.inference_mode():
106
- start_time = time.perf_counter()
107
- generated = model.generate(
108
- **inputs,
109
- max_new_tokens=max_tokens,
110
- do_sample=False,
111
- temperature=None,
112
- )
113
- end_time = time.perf_counter()
114
-
115
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
116
- print(f"Generation took {end_time - start_time:.2f} seconds")
 
cells/setup2.py DELETED
@@ -1,115 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.12"
3
- # dependencies = [
4
- # "accelerate>=1.10.1",
5
- # "torch>=2.7.0",
6
- # "kernels==0.10.0",
7
- # "transformers@https://github.com/huggingface/transformers.git",
8
- # "ipdb>=0.13.13",
9
- # "matplotlib>=3.7.2",
10
- # "numpy>=1.24.3",
11
- # ]
12
- # ///
13
-
14
- import torch
15
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
- import time
17
- import torch.nn as nn
18
- from kernels import register_kernel_mapping, Mode, LayerRepository
19
- import sys
20
- import torch.profiler
21
- import gc
22
- import logging
23
-
24
- # set to debug logging
25
- logging.basicConfig(level=logging.INFO)
26
-
27
- def reset_peak_memory_stats():
28
- """Clear CUDA cache and reset memory allocation counters."""
29
- torch.cuda.empty_cache()
30
- if torch.cuda.is_available():
31
- torch.cuda.reset_peak_memory_stats()
32
- gc.collect()
33
-
34
- def get_memory_stats():
35
- """Get current and peak CUDA memory usage."""
36
- if not torch.cuda.is_available():
37
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
38
- return {
39
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
40
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
41
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
42
- }
43
-
44
- def override_kernel_layer_name(cls_name: str, value) -> bool:
45
- """Helper to dynamically override the kernel_layer_name in a model class."""
46
- for mod in sys.modules.values():
47
- if mod is None:
48
- continue
49
- obj = getattr(mod, cls_name, None)
50
- if isinstance(obj, type) and issubclass(obj, nn.Module):
51
- setattr(obj, "kernel_layer_name", value)
52
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
53
- return True
54
- return False
55
-
56
-
57
- # Init the model the normal way
58
- model_id = "openai/gpt-oss-20b"
59
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
60
- quantization_config = Mxfp4Config(dequantize=True)
61
-
62
-
63
- from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
64
-
65
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
66
-
67
- replace_kernel_forward_from_hub(GptOssRMSNorm, None) # direct, type-safe
68
- custom_mapping = {
69
- "Yamoe": {
70
- "cuda": {
71
- Mode.INFERENCE: LayerRepository(
72
- repo_id="drbh/yamoe",
73
- layer_name="Yamoe",
74
- revision="v0.3.0",
75
- )
76
- }
77
- }
78
- }
79
- register_kernel_mapping(custom_mapping)
80
-
81
-
82
- model = GptOssForCausalLM.from_pretrained(
83
- model_id,
84
- dtype="bfloat16",
85
- device_map="auto",
86
- use_kernels=True,
87
- quantization_config=quantization_config,
88
- ).eval()
89
-
90
- messages = [
91
- {"role": "system", "content": "What is Tensor Parallelism?"},
92
- ]
93
-
94
- inputs = tokenizer.apply_chat_template(
95
- messages,
96
- add_generation_prompt=True,
97
- return_tensors="pt",
98
- return_dict=True,
99
- reasoning_effort="low",
100
- ).to("cuda")
101
-
102
- max_tokens = 512
103
-
104
- with torch.inference_mode():
105
- start_time = time.perf_counter()
106
- generated = model.generate(
107
- **inputs,
108
- max_new_tokens=max_tokens,
109
- do_sample=False,
110
- temperature=None,
111
- )
112
- end_time = time.perf_counter()
113
-
114
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
115
- print(f"Generation took {end_time - start_time:.2f} seconds")
 
index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /</h1>
19
- <ul>
20
- <li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
21
- <li><a href='moe_benchmarks/index.html' class='dir'>moe_benchmarks/</a></li>
22
- </ul>
23
- </body>
24
- </html>
 
megablocks_only.html DELETED
The diff for this file is too large to render. See raw diff
 
note.html DELETED
The diff for this file is too large to render. See raw diff
 
note_test_override.html DELETED
The diff for this file is too large to render. See raw diff
 
note_test_override.md DELETED
@@ -1,261 +0,0 @@
1
- ---
2
- title: "uvnote Integration Test Report"
3
- author: "uvnote"
4
- theme: "light"
5
- syntax_theme: "monokai"
6
- show_line_numbers: true
7
- collapse_code: false
8
- custom_css: |
9
- #output-setup {
10
- overflow-x: auto;
11
- }
12
- .cell-stdout {
13
- width: 100%;
14
- }
15
- .cell-stderr {
16
- width: max-content;
17
- max-height: 300px;
18
- overflow: auto;
19
- }
20
- ---
21
-
22
- ```python id=setup
23
- # /// script
24
- # requires-python = ">=3.12"
25
- # dependencies = [
26
- # "accelerate>=1.10.1",
27
- # "torch>=2.7.0",
28
- # "kernels==0.10.0",
29
- # "transformers@https://github.com/huggingface/transformers.git",
30
- # "ipdb>=0.13.13",
31
- # "matplotlib>=3.7.2",
32
- # "numpy>=1.24.3",
33
- # ]
34
- # ///
35
-
36
- import torch
37
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
38
- import time
39
- import torch.nn as nn
40
- from kernels import register_kernel_mapping, Mode, LayerRepository
41
- import sys
42
- import torch.profiler
43
- import gc
44
- import logging
45
-
46
- # set to debug logging
47
- logging.basicConfig(level=logging.INFO)
48
-
49
- def reset_peak_memory_stats():
50
- """Clear CUDA cache and reset memory allocation counters."""
51
- torch.cuda.empty_cache()
52
- if torch.cuda.is_available():
53
- torch.cuda.reset_peak_memory_stats()
54
- gc.collect()
55
-
56
- def get_memory_stats():
57
- """Get current and peak CUDA memory usage."""
58
- if not torch.cuda.is_available():
59
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
60
- return {
61
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
62
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
63
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
64
- }
65
-
66
- def override_kernel_layer_name(cls_name: str, value) -> bool:
67
- """Helper to dynamically override the kernel_layer_name in a model class."""
68
- for mod in sys.modules.values():
69
- if mod is None:
70
- continue
71
- obj = getattr(mod, cls_name, None)
72
- if isinstance(obj, type) and issubclass(obj, nn.Module):
73
- setattr(obj, "kernel_layer_name", value)
74
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
75
- return True
76
- return False
77
-
78
-
79
- # Init the model the normal way
80
- model_id = "openai/gpt-oss-20b"
81
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
82
- quantization_config = Mxfp4Config(dequantize=True)
83
-
84
-
85
- from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
86
-
87
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
88
-
89
- replace_kernel_forward_from_hub(GptOssMLP, "Yamoe") # direct, type-safe
90
- replace_kernel_forward_from_hub(GptOssRMSNorm, None) # direct, type-safe
91
- custom_mapping = {
92
- "Yamoe": {
93
- "cuda": {
94
- Mode.INFERENCE: LayerRepository(
95
- repo_id="drbh/yamoe",
96
- layer_name="Yamoe",
97
- revision="v0.3.0",
98
- )
99
- }
100
- }
101
- }
102
- register_kernel_mapping(custom_mapping)
103
-
104
-
105
- model = GptOssForCausalLM.from_pretrained(
106
- model_id,
107
- dtype="bfloat16",
108
- device_map="auto",
109
- use_kernels=True,
110
- quantization_config=quantization_config,
111
- ).eval()
112
-
113
- messages = [
114
- {"role": "system", "content": "What is Tensor Parallelism?"},
115
- ]
116
-
117
- inputs = tokenizer.apply_chat_template(
118
- messages,
119
- add_generation_prompt=True,
120
- return_tensors="pt",
121
- return_dict=True,
122
- reasoning_effort="low",
123
- ).to("cuda")
124
-
125
- max_tokens = 512
126
-
127
- with torch.inference_mode():
128
- start_time = time.perf_counter()
129
- generated = model.generate(
130
- **inputs,
131
- max_new_tokens=max_tokens,
132
- do_sample=False,
133
- temperature=None,
134
- )
135
- end_time = time.perf_counter()
136
-
137
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
138
- print(f"Generation took {end_time - start_time:.2f} seconds")
139
-
140
- ```
141
-
142
- # Reference kernel
143
-
144
- ```python id=setup2
145
- # /// script
146
- # requires-python = ">=3.12"
147
- # dependencies = [
148
- # "accelerate>=1.10.1",
149
- # "torch>=2.7.0",
150
- # "kernels==0.10.0",
151
- # "transformers@https://github.com/huggingface/transformers.git",
152
- # "ipdb>=0.13.13",
153
- # "matplotlib>=3.7.2",
154
- # "numpy>=1.24.3",
155
- # ]
156
- # ///
157
-
158
- import torch
159
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
160
- import time
161
- import torch.nn as nn
162
- from kernels import register_kernel_mapping, Mode, LayerRepository
163
- import sys
164
- import torch.profiler
165
- import gc
166
- import logging
167
-
168
- # set to debug logging
169
- logging.basicConfig(level=logging.INFO)
170
-
171
- def reset_peak_memory_stats():
172
- """Clear CUDA cache and reset memory allocation counters."""
173
- torch.cuda.empty_cache()
174
- if torch.cuda.is_available():
175
- torch.cuda.reset_peak_memory_stats()
176
- gc.collect()
177
-
178
- def get_memory_stats():
179
- """Get current and peak CUDA memory usage."""
180
- if not torch.cuda.is_available():
181
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
182
- return {
183
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
184
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
185
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
186
- }
187
-
188
- def override_kernel_layer_name(cls_name: str, value) -> bool:
189
- """Helper to dynamically override the kernel_layer_name in a model class."""
190
- for mod in sys.modules.values():
191
- if mod is None:
192
- continue
193
- obj = getattr(mod, cls_name, None)
194
- if isinstance(obj, type) and issubclass(obj, nn.Module):
195
- setattr(obj, "kernel_layer_name", value)
196
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
197
- return True
198
- return False
199
-
200
-
201
- # Init the model the normal way
202
- model_id = "openai/gpt-oss-20b"
203
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
204
- quantization_config = Mxfp4Config(dequantize=True)
205
-
206
-
207
- from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
208
-
209
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
210
-
211
- replace_kernel_forward_from_hub(GptOssRMSNorm, None) # direct, type-safe
212
- custom_mapping = {
213
- "Yamoe": {
214
- "cuda": {
215
- Mode.INFERENCE: LayerRepository(
216
- repo_id="drbh/yamoe",
217
- layer_name="Yamoe",
218
- revision="v0.3.0",
219
- )
220
- }
221
- }
222
- }
223
- register_kernel_mapping(custom_mapping)
224
-
225
-
226
- model = GptOssForCausalLM.from_pretrained(
227
- model_id,
228
- dtype="bfloat16",
229
- device_map="auto",
230
- use_kernels=True,
231
- quantization_config=quantization_config,
232
- ).eval()
233
-
234
- messages = [
235
- {"role": "system", "content": "What is Tensor Parallelism?"},
236
- ]
237
-
238
- inputs = tokenizer.apply_chat_template(
239
- messages,
240
- add_generation_prompt=True,
241
- return_tensors="pt",
242
- return_dict=True,
243
- reasoning_effort="low",
244
- ).to("cuda")
245
-
246
- max_tokens = 512
247
-
248
- with torch.inference_mode():
249
- start_time = time.perf_counter()
250
- generated = model.generate(
251
- **inputs,
252
- max_new_tokens=max_tokens,
253
- do_sample=False,
254
- temperature=None,
255
- )
256
- end_time = time.perf_counter()
257
-
258
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
259
- print(f"Generation took {end_time - start_time:.2f} seconds")
260
-
261
- ```
 
site/artifacts/charts/benchmark_dashboard.png DELETED
Binary file (87.7 kB)
 
site/artifacts/charts/latency.png DELETED
Binary file (31.6 kB)
 
site/artifacts/charts/memory.png DELETED
Binary file (46.3 kB)
 
site/artifacts/charts/throughput.png DELETED
Binary file (37.4 kB)
 
site/artifacts/setup/benchmark_avg_tokens_per_sec.txt DELETED
@@ -1 +0,0 @@
- 5.301658854167735
 
site/artifacts/setup/benchmark_dashboard.png DELETED
Binary file (92.9 kB)
 
site/artifacts/setup/benchmark_memory.txt DELETED
@@ -1 +0,0 @@
- 9.398672896,9.414898176,10.334765056
 
site/artifacts/setup/benchmark_times.txt DELETED
@@ -1,5 +0,0 @@
- 12.075035744113848
- 12.0710428240709
- 12.070115809096023
- 12.070908240042627
- 12.071364195086062
 
site/cells/charts.py DELETED
@@ -1,140 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "matplotlib",
4
- # "numpy",
5
- # ]
6
- # ///
7
-
8
- import matplotlib.pyplot as plt
9
- import numpy as np
10
- import os
11
-
12
- # get the path from the UVNOTE_INPUT_SETUP env var
13
- setup_path = os.getenv("UVNOTE_INPUT_SETUP", ".")
14
- print(f"Reading benchmark data from: {setup_path}")
15
-
16
- num_runs = 5
17
- max_tokens = 64
18
- times = []
19
- with open(os.path.join(setup_path, "benchmark_times.txt"), "r") as f:
20
- for line in f:
21
- times.append(float(line.strip()))
22
-
23
-
24
- avg_time = 0.0
25
- min_time = 0.0
26
- max_time = 0.0
27
- final_mem = {"allocated_gb": 0.0, "peak_gb": 0.0, "reserved_gb": 0.0}
28
-
29
- avg_tokens_per_sec = 0.0
30
- with open(os.path.join(setup_path, "benchmark_avg_tokens_per_sec.txt"), "r") as f:
31
- avg_tokens_per_sec = float(f.read().strip())
32
-
33
- times_file = os.path.join(setup_path, "benchmark_times.txt")
34
- memory_file = os.path.join(setup_path, "benchmark_memory.txt")
35
-
36
-
37
- # Minimal brutalist palette (dark theme): grayscale + 1 accent
38
- ACCENT = '#5ec8f8' # calm cyan-blue accent
39
- FG = '#e6e6e6' # light gray text/lines
40
- MUTED = '#9aa0a6' # muted gray for secondary
41
- GRID = '#333333' # grid lines
42
-
43
- # Styling tuned for clarity, high contrast, few colors
44
- plt.style.use('dark_background')
45
- plt.rcParams['figure.facecolor'] = 'none'
46
- plt.rcParams['axes.facecolor'] = 'none'
47
- plt.rcParams['savefig.facecolor'] = 'none'
48
- plt.rcParams['savefig.transparent'] = True
49
- plt.rcParams['font.family'] = 'monospace'
50
- plt.rcParams['font.weight'] = 'bold'
51
- plt.rcParams['axes.linewidth'] = 3
52
- plt.rcParams['grid.linewidth'] = 2
53
- plt.rcParams['lines.linewidth'] = 3
54
- plt.rcParams['patch.linewidth'] = 2
55
-
56
- # Prepare data
57
- runs = list(range(1, len(times) + 1))
58
- tokens_per_sec_all = [max_tokens / t for t in times]
59
-
60
- # Chart 1: Throughput Performance
61
- fig1, ax1 = plt.subplots(1, 1, figsize=(12, 6))
62
- fig1.patch.set_alpha(0)
63
- ax1.patch.set_alpha(0)
64
-
65
- ax1.plot(runs, tokens_per_sec_all, color=ACCENT, marker='o', markersize=12,
66
- markerfacecolor=ACCENT, markeredgecolor=FG, markeredgewidth=3, linewidth=5, label='tok/s')
67
- ax1.fill_between(runs, 0, tokens_per_sec_all, alpha=0.2, color=ACCENT)
68
- ax1.axhline(y=avg_tokens_per_sec, color=FG, linestyle='--', linewidth=3,
69
- label=f'AVG: {avg_tokens_per_sec:.1f}')
70
- ax1.set_title('THROUGHPUT PERFORMANCE', color=FG, fontsize=18, pad=20, fontweight='bold')
71
- ax1.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
72
- ax1.set_ylabel('TOKENS/SEC', color=FG, fontsize=14, fontweight='bold')
73
- ax1.grid(True, color=GRID, alpha=0.5, linewidth=2)
74
- ax1.tick_params(colors=FG, labelsize=12)
75
- legend1 = ax1.legend(frameon=False, loc='lower right')
76
- for text in legend1.get_texts():
77
- text.set_color(FG)
78
- text.set_fontweight('bold')
79
- plt.tight_layout()
80
- plt.savefig('throughput.png', dpi=150, bbox_inches='tight', transparent=True)
81
- plt.show()
82
-
83
- # Chart 2: Generation Latency
84
- fig2, ax2 = plt.subplots(1, 1, figsize=(12, 6))
85
- fig2.patch.set_alpha(0)
86
- ax2.patch.set_alpha(0)
87
-
88
- bar_colors = [ACCENT if i % 2 == 0 else MUTED for i in range(len(times))]
89
- bars = ax2.bar(runs, times, color=bar_colors, edgecolor=FG, linewidth=3, width=0.6)
90
- ax2.axhline(y=avg_time, color=FG, linestyle='--', linewidth=3,
91
- label=f'AVG: {avg_time:.2f}s')
92
- for i, (run, time, bar) in enumerate(zip(runs, times, bars)):
93
- ax2.text(run, time + 0.02, f'{time:.2f}s', ha='center', va='bottom',
94
- color=FG, fontweight='bold', fontsize=11)
95
- ax2.set_title('GENERATION LATENCY', color=FG, fontsize=18, pad=20, fontweight='bold')
96
- ax2.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
97
- ax2.set_ylabel('TIME (SECONDS)', color=FG, fontsize=14, fontweight='bold')
98
- ax2.grid(True, axis='y', color=GRID, alpha=0.5, linewidth=2)
99
- ax2.tick_params(colors=FG, labelsize=12)
100
- ax2.set_ylim(0, max(times) * 1.15)
101
- legend2 = ax2.legend(frameon=False, loc='upper right')
102
- for text in legend2.get_texts():
103
- text.set_color(FG)
104
- text.set_fontweight('bold')
105
- plt.tight_layout()
106
- plt.savefig('latency.png', dpi=150, bbox_inches='tight', transparent=True)
107
- plt.show()
108
-
109
- # Chart 3: Memory Usage
110
- fig3, ax3 = plt.subplots(1, 1, figsize=(12, 6))
111
- fig3.patch.set_alpha(0)
112
- ax3.patch.set_alpha(0)
113
-
114
- memory_labels = ['ALLOCATED', 'PEAK', 'RESERVED']
115
- memory_values = [final_mem['allocated_gb'], final_mem['peak_gb'], final_mem['reserved_gb']]
116
- colors_mem = [MUTED, ACCENT, FG]
117
- bars = ax3.barh(memory_labels, memory_values, color=colors_mem, edgecolor=FG, linewidth=3, height=0.5)
118
- for i, (label, value, bar) in enumerate(zip(memory_labels, memory_values, bars)):
119
- ax3.text(value + 0.5, i, f'{value:.1f} GB', va='center',
120
- color=FG, fontweight='bold', fontsize=13)
121
- ax3.set_title('MEMORY USAGE', color=FG, fontsize=18, pad=20, fontweight='bold')
122
- ax3.set_xlabel('GIGABYTES', color=FG, fontsize=14, fontweight='bold')
123
- ax3.set_xlim(0, max(memory_values) * 1.3)
124
- ax3.grid(True, axis='x', color=GRID, alpha=0.5, linewidth=2)
125
- ax3.tick_params(colors=FG, labelsize=12)
126
- ax3.set_yticks(range(len(memory_labels)))
127
- ax3.set_yticklabels(memory_labels, fontweight='bold')
128
- plt.tight_layout()
129
- plt.savefig('memory.png', dpi=150, bbox_inches='tight', transparent=True)
130
- plt.show()
131
-
132
- print(f"\n📊 Charts saved as:")
133
- print(f" • throughput.png")
134
- print(f" • latency.png")
135
- print(f" • memory.png")
136
- print(f"\nBenchmark Summary:")
137
- print(f" avg tokens/sec: {avg_tokens_per_sec:.1f}")
138
- print(f" min time: {min_time:.3f}s")
139
- print(f" max time: {max_time:.3f}s")
140
- print(f" peak memory: {final_mem['peak_gb']:.2f}GB")
 
site/cells/forward_and_backward.py DELETED
@@ -1,102 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.12"
3
- # dependencies = [
4
- # "accelerate>=1.10.1",
5
- # "torch>=2.7.0",
6
- # "kernels==0.10.0",
7
- # "transformers@https://github.com/huggingface/transformers.git",
8
- # "ipdb>=0.13.13",
9
- # "matplotlib>=3.7.2",
10
- # "numpy>=1.24.3",
11
- # ]
12
- # ///
13
-
14
- import torch
15
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
- import time
17
- import torch.nn as nn
18
- from kernels import register_kernel_mapping, Mode, LayerRepository
19
- import sys
20
- import torch.profiler
21
- import gc
22
- import logging
23
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
24
-
25
- # remove liger kernel for testing
26
- replace_kernel_forward_from_hub(GptOssRMSNorm, None)
27
-
28
- # set to debug logging
29
- logging.basicConfig(level=logging.INFO)
30
-
31
- def reset_peak_memory_stats():
32
- """Clear CUDA cache and reset memory allocation counters."""
33
- torch.cuda.empty_cache()
34
- if torch.cuda.is_available():
35
- torch.cuda.reset_peak_memory_stats()
36
- gc.collect()
37
-
38
- def get_memory_stats():
39
- """Get current and peak CUDA memory usage."""
40
- if not torch.cuda.is_available():
41
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
42
- return {
43
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
44
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
45
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
46
- }
47
-
48
- def override_kernel_layer_name(cls_name: str, value) -> bool:
49
- """Helper to dynamically override the kernel_layer_name in a model class."""
50
- for mod in sys.modules.values():
51
- if mod is None:
52
- continue
53
- obj = getattr(mod, cls_name, None)
54
- if isinstance(obj, type) and issubclass(obj, nn.Module):
55
- setattr(obj, "kernel_layer_name", value)
56
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
57
- return True
58
- return False
59
-
60
-
61
- # Init the model the normal way
62
- model_id = "openai/gpt-oss-20b"
63
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
64
- quantization_config = Mxfp4Config(dequantize=True)
65
-
66
- model = GptOssForCausalLM.from_pretrained(
67
- model_id,
68
- dtype="bfloat16",
69
- device_map="auto",
70
- use_kernels=True,
71
- quantization_config=quantization_config,
72
- training=True,
73
- ).eval()
74
-
75
- messages = [
76
- {"role": "system", "content": "What is Tensor Parallelism?"},
77
- ]
78
-
79
- inputs = tokenizer.apply_chat_template(
80
- messages,
81
- add_generation_prompt=True,
82
- return_tensors="pt",
83
- return_dict=True,
84
- reasoning_effort="low",
85
- ).to("cuda")
86
-
87
- max_tokens = 512
88
-
89
-
90
- # forward and backward pass
91
- with torch.autograd.set_grad_enabled(True):
92
- start_time = time.perf_counter()
93
- generated = model.generate(
94
- **inputs,
95
- max_new_tokens=max_tokens,
96
- do_sample=False,
97
- temperature=None,
98
- )
99
- end_time = time.perf_counter()
100
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
101
- print(f"Generation took {end_time - start_time:.2f} seconds")
102
-
 
site/cells/forward_only.py DELETED
@@ -1,96 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.12"
3
- # dependencies = [
4
- # "accelerate>=1.10.1",
5
- # "torch>=2.7.0",
6
- # "kernels==0.10.0",
7
- # "transformers@https://github.com/huggingface/transformers.git",
8
- # "ipdb>=0.13.13",
9
- # "matplotlib>=3.7.2",
10
- # "numpy>=1.24.3",
11
- # ]
12
- # ///
13
-
14
- import torch
15
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
- import time
17
- import torch.nn as nn
18
- from kernels import register_kernel_mapping, Mode, LayerRepository
19
- import sys
20
- import torch.profiler
21
- import gc
22
- import logging
23
-
24
- # set to debug logging
25
- logging.basicConfig(level=logging.INFO)
26
-
27
- def reset_peak_memory_stats():
28
- """Clear CUDA cache and reset memory allocation counters."""
29
- torch.cuda.empty_cache()
30
- if torch.cuda.is_available():
31
- torch.cuda.reset_peak_memory_stats()
32
- gc.collect()
33
-
34
- def get_memory_stats():
35
- """Get current and peak CUDA memory usage."""
36
- if not torch.cuda.is_available():
37
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
38
- return {
39
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
40
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
41
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
42
- }
43
-
44
- def override_kernel_layer_name(cls_name: str, value) -> bool:
45
- """Helper to dynamically override the kernel_layer_name in a model class."""
46
- for mod in sys.modules.values():
47
- if mod is None:
48
- continue
49
- obj = getattr(mod, cls_name, None)
50
- if isinstance(obj, type) and issubclass(obj, nn.Module):
51
- setattr(obj, "kernel_layer_name", value)
52
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
53
- return True
54
- return False
55
-
56
-
57
- # Init the model the normal way
58
- model_id = "openai/gpt-oss-20b"
59
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
60
- quantization_config = Mxfp4Config(dequantize=True)
61
-
62
-
63
- model = GptOssForCausalLM.from_pretrained(
64
- model_id,
65
- dtype="bfloat16",
66
- device_map="auto",
67
- use_kernels=True,
68
- quantization_config=quantization_config,
69
- ).eval()
70
-
71
- messages = [
72
- {"role": "system", "content": "What is Tensor Parallelism?"},
73
- ]
74
-
75
- inputs = tokenizer.apply_chat_template(
76
- messages,
77
- add_generation_prompt=True,
78
- return_tensors="pt",
79
- return_dict=True,
80
- reasoning_effort="low",
81
- ).to("cuda")
82
-
83
- max_tokens = 512
84
-
85
- with torch.inference_mode():
86
- start_time = time.perf_counter()
87
- generated = model.generate(
88
- **inputs,
89
- max_new_tokens=max_tokens,
90
- do_sample=False,
91
- temperature=None,
92
- )
93
- end_time = time.perf_counter()
94
-
95
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
96
- print(f"Generation took {end_time - start_time:.2f} seconds")
 
site/cells/setup.py DELETED
@@ -1,116 +0,0 @@
- # /// script
- # requires-python = ">=3.12"
- # dependencies = [
- #     "accelerate>=1.10.1",
- #     "torch>=2.7.0",
- #     "kernels==0.10.0",
- #     "transformers@https://github.com/huggingface/transformers.git",
- #     "ipdb>=0.13.13",
- #     "matplotlib>=3.7.2",
- #     "numpy>=1.24.3",
- # ]
- # ///
-
- import torch
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
- import time
- import torch.nn as nn
- from kernels import register_kernel_mapping, Mode, LayerRepository
- import sys
- import torch.profiler
- import gc
- import logging
-
- # set to debug logging
- logging.basicConfig(level=logging.INFO)
-
- def reset_peak_memory_stats():
-     """Clear CUDA cache and reset memory allocation counters."""
-     torch.cuda.empty_cache()
-     if torch.cuda.is_available():
-         torch.cuda.reset_peak_memory_stats()
-     gc.collect()
-
- def get_memory_stats():
-     """Get current and peak CUDA memory usage."""
-     if not torch.cuda.is_available():
-         return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-     return {
-         "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-         "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-         "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-     }
-
- def override_kernel_layer_name(cls_name: str, value) -> bool:
-     """Helper to dynamically override the kernel_layer_name in a model class."""
-     for mod in sys.modules.values():
-         if mod is None:
-             continue
-         obj = getattr(mod, cls_name, None)
-         if isinstance(obj, type) and issubclass(obj, nn.Module):
-             setattr(obj, "kernel_layer_name", value)
-             print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-             return True
-     return False
-
-
- # Init the model the normal way
- model_id = "openai/gpt-oss-20b"
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
- quantization_config = Mxfp4Config(dequantize=True)
-
-
- from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
-
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
-
- replace_kernel_forward_from_hub(GptOssMLP, "Yamoe")  # direct, type-safe
- replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
- custom_mapping = {
-     "Yamoe": {
-         "cuda": {
-             Mode.INFERENCE: LayerRepository(
-                 repo_id="drbh/yamoe",
-                 layer_name="Yamoe",
-                 revision="v0.3.0",
-             )
-         }
-     }
- }
- register_kernel_mapping(custom_mapping)
-
-
- model = GptOssForCausalLM.from_pretrained(
-     model_id,
-     dtype="bfloat16",
-     device_map="auto",
-     use_kernels=True,
-     quantization_config=quantization_config,
- ).eval()
-
- messages = [
-     {"role": "system", "content": "What is Tensor Parallelism?"},
- ]
-
- inputs = tokenizer.apply_chat_template(
-     messages,
-     add_generation_prompt=True,
-     return_tensors="pt",
-     return_dict=True,
-     reasoning_effort="low",
- ).to("cuda")
-
- max_tokens = 512
-
- with torch.inference_mode():
-     start_time = time.perf_counter()
-     generated = model.generate(
-         **inputs,
-         max_new_tokens=max_tokens,
-         do_sample=False,
-         temperature=None,
-     )
-     end_time = time.perf_counter()
-
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
- print(f"Generation took {end_time - start_time:.2f} seconds")
 
site/cells/setup2.py DELETED
@@ -1,115 +0,0 @@
- # /// script
- # requires-python = ">=3.12"
- # dependencies = [
- #     "accelerate>=1.10.1",
- #     "torch>=2.7.0",
- #     "kernels==0.10.0",
- #     "transformers@https://github.com/huggingface/transformers.git",
- #     "ipdb>=0.13.13",
- #     "matplotlib>=3.7.2",
- #     "numpy>=1.24.3",
- # ]
- # ///
-
- import torch
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
- import time
- import torch.nn as nn
- from kernels import register_kernel_mapping, Mode, LayerRepository
- import sys
- import torch.profiler
- import gc
- import logging
-
- # set to debug logging
- logging.basicConfig(level=logging.INFO)
-
- def reset_peak_memory_stats():
-     """Clear CUDA cache and reset memory allocation counters."""
-     torch.cuda.empty_cache()
-     if torch.cuda.is_available():
-         torch.cuda.reset_peak_memory_stats()
-     gc.collect()
-
- def get_memory_stats():
-     """Get current and peak CUDA memory usage."""
-     if not torch.cuda.is_available():
-         return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-     return {
-         "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-         "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-         "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-     }
-
- def override_kernel_layer_name(cls_name: str, value) -> bool:
-     """Helper to dynamically override the kernel_layer_name in a model class."""
-     for mod in sys.modules.values():
-         if mod is None:
-             continue
-         obj = getattr(mod, cls_name, None)
-         if isinstance(obj, type) and issubclass(obj, nn.Module):
-             setattr(obj, "kernel_layer_name", value)
-             print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-             return True
-     return False
-
-
- # Init the model the normal way
- model_id = "openai/gpt-oss-20b"
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
- quantization_config = Mxfp4Config(dequantize=True)
-
-
- from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
-
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
-
- replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
- custom_mapping = {
-     "Yamoe": {
-         "cuda": {
-             Mode.INFERENCE: LayerRepository(
-                 repo_id="drbh/yamoe",
-                 layer_name="Yamoe",
-                 revision="v0.3.0",
-             )
-         }
-     }
- }
- register_kernel_mapping(custom_mapping)
-
-
- model = GptOssForCausalLM.from_pretrained(
-     model_id,
-     dtype="bfloat16",
-     device_map="auto",
-     use_kernels=True,
-     quantization_config=quantization_config,
- ).eval()
-
- messages = [
-     {"role": "system", "content": "What is Tensor Parallelism?"},
- ]
-
- inputs = tokenizer.apply_chat_template(
-     messages,
-     add_generation_prompt=True,
-     return_tensors="pt",
-     return_dict=True,
-     reasoning_effort="low",
- ).to("cuda")
-
- max_tokens = 512
-
- with torch.inference_mode():
-     start_time = time.perf_counter()
-     generated = model.generate(
-         **inputs,
-         max_new_tokens=max_tokens,
-         do_sample=False,
-         temperature=None,
-     )
-     end_time = time.perf_counter()
-
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
- print(f"Generation took {end_time - start_time:.2f} seconds")
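Note: the two deleted setup scripts differ only in whether the MLP forward is swapped directly. setup.py calls replace_kernel_forward_from_hub(GptOssMLP, "Yamoe") before registering the mapping, while setup2.py registers the mapping alone and relies on use_kernels=True to resolve it. Below is a minimal, illustrative sketch of just that kernel-mapping step; the repo (drbh/yamoe), layer name (Yamoe), and revision (v0.3.0) are copied from the deleted files, and the snippet is not itself part of this commit.

# Illustrative sketch (not part of this commit): register the Yamoe CUDA
# inference kernel for GptOssMLP, mirroring the deleted setup.py.
from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP

# Point GptOssMLP at the "Yamoe" kernel layer (direct, type-safe override).
replace_kernel_forward_from_hub(GptOssMLP, "Yamoe")

# Map the "Yamoe" name to a concrete Hub kernel repository for CUDA inference.
register_kernel_mapping({
    "Yamoe": {
        "cuda": {
            Mode.INFERENCE: LayerRepository(
                repo_id="drbh/yamoe",
                layer_name="Yamoe",
                revision="v0.3.0",
            )
        }
    }
})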
 
site/megablocks_only.html DELETED
The diff for this file is too large to render.
 
site/note.html DELETED
The diff for this file is too large to render.
 
site/note_test_override.html DELETED
The diff for this file is too large to render.
 
style.css DELETED
@@ -1,28 +0,0 @@
- body {
-     padding: 2rem;
-     font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
- }
-
- h1 {
-     font-size: 16px;
-     margin-top: 0;
- }
-
- p {
-     color: rgb(107, 114, 128);
-     font-size: 15px;
-     margin-bottom: 10px;
-     margin-top: 5px;
- }
-
- .card {
-     max-width: 620px;
-     margin: 0 auto;
-     padding: 16px;
-     border: 1px solid lightgray;
-     border-radius: 16px;
- }
-
- .card p:last-child {
-     margin-bottom: 0;
- }