drbh (HF Staff) committed (verified)
Commit 93c5002 · Parent(s): 6ba657a

Upload folder using huggingface_hub

Files changed (27)
  1. .venv/index.html +24 -0
  2. .venv/lib/index.html +24 -0
  3. .venv/lib/python3.11/index.html +24 -0
  4. .venv/lib/python3.11/site-packages/flask/index.html +24 -0
  5. .venv/lib/python3.11/site-packages/flask/sansio/index.html +24 -0
  6. .venv/lib/python3.11/site-packages/index.html +26 -0
  7. .venv/lib/python3.11/site-packages/markdown-3.9.dist-info/index.html +24 -0
  8. .venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses/index.html +24 -0
  9. .venv/lib/python3.11/site-packages/werkzeug/debug/index.html +24 -0
  10. .venv/lib/python3.11/site-packages/werkzeug/debug/shared/index.html +24 -0
  11. .venv/lib/python3.11/site-packages/werkzeug/index.html +24 -0
  12. index.html +0 -0
  13. moe_benchmarks/index.html +25 -0
  14. moe_benchmarks/megablocks/cells/forward_only.py +101 -0
  15. moe_benchmarks/megablocks/index.html +24 -0
  16. moe_benchmarks/megablocks/megablocks_only.html +0 -0
  17. moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc +0 -0
  18. moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc +0 -0
  19. moe_benchmarks/megablocks_yamoe/cells/bench_utils.py +241 -0
  20. moe_benchmarks/megablocks_yamoe/cells/config.py +27 -0
  21. moe_benchmarks/megablocks_yamoe/cells/nv.py +3 -0
  22. moe_benchmarks/megablocks_yamoe/cells/save_data.py +42 -0
  23. moe_benchmarks/megablocks_yamoe/cells/utils.py +34 -0
  24. moe_benchmarks/megablocks_yamoe/cells/yamoe_run.py +135 -0
  25. moe_benchmarks/megablocks_yamoe/index.html +25 -0
  26. moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html +0 -0
  27. moe_benchmarks/megablocks_yamoe/torch_profile.html +0 -0
.venv/index.html ADDED
@@ -0,0 +1,24 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /.venv</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='lib/index.html' class='dir'>lib/</a></li>
+ </ul>
+ </body>
+ </html>
.venv/lib/index.html ADDED
@@ -0,0 +1,24 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /.venv/lib</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='python3.11/index.html' class='dir'>python3.11/</a></li>
+ </ul>
+ </body>
+ </html>
.venv/lib/python3.11/index.html ADDED
@@ -0,0 +1,24 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /.venv/lib/python3.11</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='site-packages/index.html' class='dir'>site-packages/</a></li>
+ </ul>
+ </body>
+ </html>
.venv/lib/python3.11/site-packages/flask/index.html ADDED
@@ -0,0 +1,24 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /.venv/lib/python3.11/site-packages/flask</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='sansio/index.html' class='dir'>sansio/</a></li>
+ </ul>
+ </body>
+ </html>
.venv/lib/python3.11/site-packages/flask/sansio/index.html ADDED
@@ -0,0 +1,24 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /.venv/lib/python3.11/site-packages/flask/sansio</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='README.html' class='file'>README.html</a></li>
+ </ul>
+ </body>
+ </html>
.venv/lib/python3.11/site-packages/index.html ADDED
@@ -0,0 +1,26 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /.venv/lib/python3.11/site-packages</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='flask/index.html' class='dir'>flask/</a></li>
+ <li><a href='markdown-3.9.dist-info/index.html' class='dir'>markdown-3.9.dist-info/</a></li>
+ <li><a href='werkzeug/index.html' class='dir'>werkzeug/</a></li>
+ </ul>
+ </body>
+ </html>
.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/index.html ADDED
@@ -0,0 +1,24 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /.venv/lib/python3.11/site-packages/markdown-3.9.dist-info</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='licenses/index.html' class='dir'>licenses/</a></li>
+ </ul>
+ </body>
+ </html>
.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses/index.html ADDED
@@ -0,0 +1,24 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='LICENSE.html' class='file'>LICENSE.html</a></li>
+ </ul>
+ </body>
+ </html>
.venv/lib/python3.11/site-packages/werkzeug/debug/index.html ADDED
@@ -0,0 +1,24 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /.venv/lib/python3.11/site-packages/werkzeug/debug</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='shared/index.html' class='dir'>shared/</a></li>
+ </ul>
+ </body>
+ </html>
.venv/lib/python3.11/site-packages/werkzeug/debug/shared/index.html ADDED
@@ -0,0 +1,24 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /.venv/lib/python3.11/site-packages/werkzeug/debug/shared</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='ICON_LICENSE.html' class='file'>ICON_LICENSE.html</a></li>
+ </ul>
+ </body>
+ </html>
.venv/lib/python3.11/site-packages/werkzeug/index.html ADDED
@@ -0,0 +1,24 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /.venv/lib/python3.11/site-packages/werkzeug</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='debug/index.html' class='dir'>debug/</a></li>
+ </ul>
+ </body>
+ </html>
index.html CHANGED
The diff for this file is too large to render. See raw diff
 
moe_benchmarks/index.html ADDED
@@ -0,0 +1,25 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /moe_benchmarks</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='megablocks/index.html' class='dir'>megablocks/</a></li>
+ <li><a href='megablocks_yamoe/index.html' class='dir'>megablocks_yamoe/</a></li>
+ </ul>
+ </body>
+ </html>
moe_benchmarks/megablocks/cells/forward_only.py ADDED
@@ -0,0 +1,101 @@
+ # /// script
+ # requires-python = ">=3.12"
+ # dependencies = [
+ #     "accelerate>=1.10.1",
+ #     "torch>=2.7.0",
+ #     "kernels==0.10.0",
+ #     "transformers @ git+https://github.com/huggingface/transformers.git",
+ #     "ipdb>=0.13.13",
+ #     "matplotlib>=3.7.2",
+ #     "numpy>=1.24.3",
+ # ]
+ # ///
+
+ import torch
+ from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
+ import time
+ import torch.nn as nn
+ from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub
+ import sys
+ import torch.profiler
+ import gc
+ import logging
+ from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
+
+
+ replace_kernel_forward_from_hub(GptOssRMSNorm, None)
+
+ # configure logging at INFO level
+ logging.basicConfig(level=logging.INFO)
+
+ def reset_peak_memory_stats():
+     """Clear CUDA cache and reset memory allocation counters."""
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+         torch.cuda.reset_peak_memory_stats()
+     gc.collect()
+
+ def get_memory_stats():
+     """Get current and peak CUDA memory usage."""
+     if not torch.cuda.is_available():
+         return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
+     return {
+         "allocated_gb": torch.cuda.memory_allocated() / 1e9,
+         "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
+         "reserved_gb": torch.cuda.memory_reserved() / 1e9,
+     }
+
+ def override_kernel_layer_name(cls_name: str, value) -> bool:
+     """Helper to dynamically override the kernel_layer_name in a model class."""
+     for mod in sys.modules.values():
+         if mod is None:
+             continue
+         obj = getattr(mod, cls_name, None)
+         if isinstance(obj, type) and issubclass(obj, nn.Module):
+             setattr(obj, "kernel_layer_name", value)
+             print(f"Overrode {cls_name}.kernel_layer_name to {value}")
+             return True
+     return False
+
+
+ # Init the model the normal way
+ model_id = "openai/gpt-oss-20b"
+ tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
+ quantization_config = Mxfp4Config(dequantize=True)
+
+ model = GptOssForCausalLM.from_pretrained(
+     model_id,
+     dtype="bfloat16",
+     device_map="auto",
+     use_kernels=True,
+     quantization_config=quantization_config,
+ ).eval()
+
+ messages = [
+     {"role": "system", "content": "What is Tensor Parallelism?"},
+ ]
+
+ inputs = tokenizer.apply_chat_template(
+     messages,
+     add_generation_prompt=True,
+     return_tensors="pt",
+     return_dict=True,
+     reasoning_effort="low",
+ ).to("cuda")
+
+ max_tokens = 256
+
+ with torch.inference_mode():
+     start_time = time.perf_counter()
+     generated = model.generate(
+         **inputs,
+         max_new_tokens=max_tokens,
+         do_sample=False,
+         temperature=None,
+     )
+     end_time = time.perf_counter()
+
+ print(tokenizer.decode(generated[0], skip_special_tokens=False))
+ print(f"Generation took {end_time - start_time:.2f} seconds")
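The memory helpers above (reset_peak_memory_stats / get_memory_stats) are defined but never called in the script body. Below is a minimal, self-contained sketch of the same CUDA memory-accounting pattern; the names here (peak_gb) are illustrative and not part of the upload:

import torch

def peak_gb() -> float:
    """Peak CUDA memory allocated (GB) since the last reset; 0.0 without CUDA."""
    if not torch.cuda.is_available():
        return 0.0
    return torch.cuda.max_memory_allocated() / 1e9

if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(2048, 2048, device=device)
y = x @ x  # stand-in for the generate() call timed above
print(f"peak allocated: {peak_gb():.3f} GB")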
moe_benchmarks/megablocks/index.html ADDED
@@ -0,0 +1,24 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /moe_benchmarks/megablocks</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='megablocks_only.html' class='file'>megablocks_only.html</a></li>
+ </ul>
+ </body>
+ </html>
moe_benchmarks/megablocks/megablocks_only.html ADDED
The diff for this file is too large to render. See raw diff
 
moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc ADDED
Binary file (16.1 kB). View file
 
moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc ADDED
Binary file (724 Bytes). View file
 
moe_benchmarks/megablocks_yamoe/cells/bench_utils.py ADDED
@@ -0,0 +1,241 @@
+ # /// script
+ # dependencies = [
+ #     "torch",
+ #     "numpy",
+ # ]
+ # ///
+
+ """Reusable benchmarking utilities for performance testing."""
+ import time
+ import numpy as np
+ from contextlib import contextmanager
+ from typing import Callable, Dict, Tuple, Any, Optional
+ import torch
+
+ def to_dtype(dtype_str: str):
+     """Convert string to torch dtype."""
+     if dtype_str == "float16":
+         return torch.float16
+     if dtype_str == "bfloat16":
+         return torch.bfloat16
+     return torch.float32
+
+ def _sync(device: str):
+     """Synchronize device if CUDA."""
+     if device == "cuda":
+         torch.cuda.synchronize()
+
+ def _compute_stats(times_s, tokens: Optional[int] = None) -> Dict[str, float]:
+     """Compute comprehensive latency and throughput statistics."""
+     lat_ms = np.array([t * 1000.0 for t in times_s])
+     lat_ms_sorted = np.sort(lat_ms)
+     n = len(lat_ms)
+
+     stats = {
+         "avg_ms": np.mean(lat_ms),
+         "min_ms": np.min(lat_ms),
+         "max_ms": np.max(lat_ms),
+         "std_ms": np.std(lat_ms),
+         "p50_ms": np.percentile(lat_ms, 50),
+         "p95_ms": np.percentile(lat_ms, 95),
+         "p99_ms": np.percentile(lat_ms, 99),
+         "num_iters": n
+     }
+
+     if tokens is not None and n > 0:
+         avg_s = np.mean(times_s)
+         stats["tokens_per_s"] = tokens / avg_s if avg_s > 0 else float("inf")
+         stats["throughput_variance"] = np.std([tokens / t for t in times_s if t > 0])
+
+     return stats
+
+ def _format_timing_stats(stats: Dict[str, float], tokens: Optional[int] = None) -> str:
+     """Format timing statistics for display."""
+     lines = [
+         "\n━━━━━━━━━━━━━━━━━━━━ Benchmark Results ━━━━━━━━━━━━━━━━━━━━",
+         f"Iterations: {stats.get('num_iters', 0)}",
+         "\nLatency Statistics:",
+         f"  Average: {stats['avg_ms']:.3f} ms",
+         f"  Min:     {stats['min_ms']:.3f} ms",
+         f"  Max:     {stats['max_ms']:.3f} ms",
+         f"  Std Dev: {stats['std_ms']:.3f} ms",
+         "\nPercentiles:",
+         f"  P50 (median): {stats['p50_ms']:.3f} ms",
+         f"  P95:          {stats['p95_ms']:.3f} ms",
+         f"  P99:          {stats['p99_ms']:.3f} ms",
+     ]
+
+     if tokens is not None and 'tokens_per_s' in stats:
+         lines.extend([
+             "\nThroughput:",
+             f"  Tokens/sec: {stats['tokens_per_s']:.1f}",
+             f"  Std Dev:    {stats.get('throughput_variance', 0):.1f}",
+         ])
+
+     lines.append("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
+     return "\n".join(lines)
+
+ def _bench_engine(
+     call: Callable[..., Any], *, warmup: int, iters: int, device: str, dtype, input_gen: Optional[Callable[[], Any]] = None
+ ) -> Tuple[Any, list]:
+     """Core benchmarking engine with warmup and timing."""
+     use_autocast = device == "cuda" and dtype in (torch.float16, torch.bfloat16)
+
+     # Warmup phase
+     print(f"\nWarming up ({warmup} iterations)...")
+     with torch.inference_mode():
+         for _ in range(max(0, warmup)):
+             if use_autocast:
+                 with torch.autocast(device_type="cuda", dtype=dtype):
+                     if input_gen is not None:
+                         _ = call(input_gen())
+                     else:
+                         _ = call()
+             else:
+                 if input_gen is not None:
+                     _ = call(input_gen())
+                 else:
+                     _ = call()
+     _sync(device)
+
+     # Measurement phase
+     print(f"Benchmarking ({iters} iterations)...")
+     times_s = []
+     last = None
+     with torch.inference_mode():
+         for i in range(max(1, iters)):
+             start = time.perf_counter()
+             if use_autocast:
+                 with torch.autocast(device_type="cuda", dtype=dtype):
+                     if input_gen is not None:
+                         last = call(input_gen())
+                     else:
+                         last = call()
+             else:
+                 if input_gen is not None:
+                     last = call(input_gen())
+                 else:
+                     last = call()
+             _sync(device)
+             end = time.perf_counter()
+             times_s.append(end - start)
+
+             # Progress indicator every 20% of iterations
+             if i > 0 and i % max(1, iters // 5) == 0:
+                 pct = (i / iters) * 100
+                 avg_so_far = np.mean(times_s[:i]) * 1000
+                 print(f"  Progress: {pct:.0f}% complete (avg: {avg_so_far:.3f} ms)")
+
+     return last, times_s
+
+ def tensor_stats(t: torch.Tensor) -> str:
+     """Generate comprehensive stats string for a tensor."""
+     return (f"shape={tuple(t.shape)}, "
+             f"dtype={t.dtype}, "
+             f"device={t.device}, "
+             f"range=[{t.min().item():.6f}, {t.max().item():.6f}], "
+             f"mean={t.mean().item():.6f}, "
+             f"std={t.std().item():.6f}, "
+             f"norm={t.norm().item():.6f}")
+
+ @contextmanager
+ def bench_context(
+     *, warmup: int = 25, iters: int = 100, device: str = "cuda", dtype=torch.float32, tokens: Optional[int] = None, verbose: bool = True, save_json: Optional[str] = None, vary_inputs: bool = True
+ ):
+     """Context that yields a runner: runner(fn, *args, **kwargs) -> (result, stats).
+
+     If vary_inputs=True, the first argument should be a base tensor that will be varied each iteration
+     by adding a small deterministic increment to prevent caching artifacts.
+     """
+
+     def runner(fn: Callable[..., Any], *args, **kwargs) -> Tuple[Any, Dict[str, float]]:
+         # Log configuration
+         if verbose:
+             print("\n┌─ Benchmark Configuration ─────────────────────────────┐")
+             # print(f"│ Device: {device:<15} Dtype: {dtype} │")
+             print(f"│ Warmup: {warmup:<15} Iters: {iters} │")
+             if tokens:
+                 print(f"│ Tokens: {tokens} │")
+             if vary_inputs:
+                 print("│ Input Variation: Enabled (prevents caching artifacts) │")
+             print("└────────────────────────────────────────────────────────┘")
+
+         # Set up input generation
+         input_gen = None
+         if vary_inputs and args and isinstance(args[0], torch.Tensor):
+             base_input = args[0].clone()
+             iteration_counter = [0]  # Use list for mutable closure
+
+             def generate_varied_input():
+                 """Generate input tensor varied by iteration to prevent caching."""
+                 # Add small deterministic increment: 0.001 * iteration_number
+                 varied_input = base_input + (iteration_counter[0] * 0.001)
+                 iteration_counter[0] += 1
+                 return varied_input
+
+             input_gen = generate_varied_input
+             call = lambda x: fn(x, *args[1:], **kwargs)
+
+             # Log base input stats
+             if verbose:
+                 print(f"\nBase Input: {tensor_stats(base_input)}")
+                 print(f"Input Variation: +{0.001:.3f} * iteration (deterministic)")
+         else:
+             # Legacy mode - static inputs
+             call = lambda: fn(*args, **kwargs)
+             if verbose and args and isinstance(args[0], torch.Tensor):
+                 print(f"\nInput: {tensor_stats(args[0])}")
+
+         result, times_s = _bench_engine(call, warmup=warmup, iters=iters, device=device, dtype=dtype, input_gen=input_gen)
+
+         # Log output if it's a tensor or tuple with tensors
+         if verbose:
+             print("\nOutput tensors:")
+             if isinstance(result, torch.Tensor):
+                 print(f"  Primary: {tensor_stats(result)}")
+             elif isinstance(result, tuple) and len(result) > 0 and isinstance(result[0], torch.Tensor):
+                 print(f"  Primary: {tensor_stats(result[0])}")
+                 if len(result) > 1:
+                     if isinstance(result[1], torch.Tensor):
+                         print(f"  Auxiliary: {tensor_stats(result[1])}")
+                     else:
+                         print(f"  Auxiliary: {type(result[1]).__name__}")
+
+         # Compute and display statistics
+         stats = _compute_stats(times_s, tokens=tokens)
+         if verbose:
+             print(_format_timing_stats(stats, tokens))
+
+         # Save to JSON if requested
+         if save_json:
+             import json
+             json_data = {
+                 "implementation": save_json.replace(".json", ""),
+                 "config": {
+                     "warmup": warmup,
+                     "iters": iters,
+                     "device": str(device),  # Convert device to string
+                     "dtype": str(dtype),
+                     "tokens": tokens,
+                     "vary_inputs": vary_inputs
+                 },
+                 "stats": stats,
+                 "output_sum": float(result[0].sum().item()) if isinstance(result, tuple) and len(result) > 0 else float(result.sum().item()) if isinstance(result, torch.Tensor) else None
+             }
+             with open(save_json, 'w') as f:
+                 json.dump(json_data, f, indent=2)
+             if verbose:
+                 print(f"\nSaved benchmark results to {save_json}")
+
+         return result, stats
+
+     yield runner
+
+ def set_seed(seed: int):
+     """Set seeds for reproducibility."""
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed(seed)
+         torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
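yamoe_run.py below consumes bench_context; here is a minimal sketch of the runner pattern on a plain linear layer (the shapes are arbitrary, chosen only for illustration):

import torch
import torch.nn.functional as F
from bench_utils import bench_context, set_seed

set_seed(0)
device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(64, 1152, device=device)   # first positional tensor is varied per iteration
w = torch.randn(1152, 1152, device=device)

# runner(fn, *args, **kwargs) -> (result, stats); vary_inputs=True (the default)
# nudges x by 0.001 * iteration to defeat caching between timed calls.
with bench_context(warmup=5, iters=20, device=device, tokens=64, verbose=False) as bench:
    out, stats = bench(F.linear, x, w)
print(f"p50: {stats['p50_ms']:.3f} ms, tokens/s: {stats.get('tokens_per_s', 0):.0f}")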
moe_benchmarks/megablocks_yamoe/cells/config.py ADDED
@@ -0,0 +1,27 @@
+ # /// script
+ # dependencies = [
+ #     "torch",
+ #     "numpy",
+ # ]
+ # ///
+
+ """Shared configuration for both implementations."""
+ import torch
+
+ # Model configuration
+ NUM_EXPERTS = 128
+ HIDDEN_SIZE = 1152
+ INTERMEDIATE_SIZE = 3072
+ TOP_K = 4
+
+ # Input configuration
+ BATCH_SIZE = 1
+ SEQ_LEN = 100
+ DTYPE = "float32"
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Seeds for reproducibility
+ WEIGHT_SEED = 999
+ EXPERT_SEED = 777
+ INPUT_SEED = 123
+ GENERAL_SEED = 42
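A small worked example of what these numbers imply for routing load, assuming perfectly even routing (ceil_div mirrors the helper defined in yamoe_run.py below):

from config import BATCH_SIZE, SEQ_LEN, TOP_K, NUM_EXPERTS

def ceil_div(a: int, b: int) -> int:
    return (a + b - 1) // b

tokens = BATCH_SIZE * SEQ_LEN                    # 1 * 100 = 100 tokens per forward pass
assignments = tokens * TOP_K                     # 100 * 4 = 400 expert assignments
per_expert = ceil_div(assignments, NUM_EXPERTS)  # ceil(400 / 128) = 4
print(f"{assignments} assignments across {NUM_EXPERTS} experts -> >= {per_expert} slots each")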
moe_benchmarks/megablocks_yamoe/cells/nv.py ADDED
@@ -0,0 +1,3 @@
+ import subprocess
+
+ print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
moe_benchmarks/megablocks_yamoe/cells/save_data.py ADDED
@@ -0,0 +1,42 @@
+ # /// script
+ # dependencies = [
+ #     "torch",
+ #     "numpy",
+ # ]
+ # ///
+
+ """
+ Generate deterministic shared weights once and save as artifacts so
+ both implementations load identical parameters.
+ """
+ import torch
+ from config import NUM_EXPERTS, HIDDEN_SIZE, WEIGHT_SEED, EXPERT_SEED
+
+ def save_shared_weights():
+     # Router: Kaiming uniform as used by both, bias zeros
+     torch.manual_seed(WEIGHT_SEED)
+     router_weight = torch.empty(NUM_EXPERTS, HIDDEN_SIZE)
+     torch.nn.init.kaiming_uniform_(router_weight)
+     router_bias = torch.zeros(NUM_EXPERTS)
+
+     # Experts: normal(0, 0.02), biases zeros
+     torch.manual_seed(EXPERT_SEED)
+     gate_up_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, 2 * HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
+     gate_up_proj_bias = torch.zeros(NUM_EXPERTS, 2 * HIDDEN_SIZE)
+     down_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
+     down_proj_bias = torch.zeros(NUM_EXPERTS, HIDDEN_SIZE)
+
+     # Save artifacts
+     torch.save(router_weight, 'router_weight.pt')
+     torch.save(router_bias, 'router_bias.pt')
+     torch.save(gate_up_proj, 'gate_up_proj.pt')
+     torch.save(gate_up_proj_bias, 'gate_up_proj_bias.pt')
+     torch.save(down_proj, 'down_proj.pt')
+     torch.save(down_proj_bias, 'down_proj_bias.pt')
+
+     print("Saved shared weights to artifacts")
+     print(f"Router weight sum: {router_weight.sum().item():.6f}")
+     print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+     print(f"Down sum: {down_proj.sum().item():.6f}")
+
+ save_shared_weights()
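A quick round-trip check, assuming the script above has already run in the current directory:

import torch

# Reload one artifact and compare against the checksum printed by save_shared_weights().
router_weight = torch.load('router_weight.pt')
print(f"Router weight sum after reload: {router_weight.sum().item():.6f}")
assert router_weight.shape == (128, 1152)  # (NUM_EXPERTS, HIDDEN_SIZE)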
moe_benchmarks/megablocks_yamoe/cells/utils.py ADDED
@@ -0,0 +1,34 @@
+ # /// script
+ # dependencies = [
+ #     "torch",
+ #     "numpy",
+ # ]
+ # ///
+
+ """Simple utilities for running the models."""
+ import torch
+
+ def to_dtype(dtype_str: str):
+     """Convert string to torch dtype."""
+     if dtype_str == "float16":
+         return torch.float16
+     if dtype_str == "bfloat16":
+         return torch.bfloat16
+     return torch.float32
+
+ def tensor_stats(t: torch.Tensor) -> str:
+     """Generate stats string for a tensor."""
+     return (f"shape={tuple(t.shape)}, "
+             f"dtype={t.dtype}, "
+             f"device={t.device}, "
+             f"mean={t.mean().item():.6f}, "
+             f"std={t.std().item():.6f}")
+
+ def set_seed(seed: int):
+     """Set seeds for reproducibility."""
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed(seed)
+         torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
moe_benchmarks/megablocks_yamoe/cells/yamoe_run.py ADDED
@@ -0,0 +1,135 @@
+ # /// script
+ # dependencies = [
+ #     "torch",
+ #     "kernels",
+ #     "numpy",
+ # ]
+ # ///
+
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+ from kernels import get_kernel, get_local_kernel
+ from bench_utils import to_dtype, tensor_stats, set_seed, bench_context
+ from config import (
+     NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+     BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+     WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+ )
+ from pathlib import Path
+ import os
+
+ # Discover the upstream artifact directory from env
+ data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+ print(f"Loading weights from: {data_dir}")
+
+ router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+ router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+ gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+ gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+ down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+ down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+ print("Loaded shared weights from artifacts")
+ print(f"Router weight sum: {router_weight.sum().item():.6f}")
+ print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+ print(f"Down sum: {down_proj.sum().item():.6f}")
+
+ class YamoeRouter(nn.Module):
+     def __init__(self, router_weight, router_bias):
+         super().__init__()
+         self.top_k = TOP_K
+         self.num_experts = NUM_EXPERTS
+         self.hidden_dim = HIDDEN_SIZE
+         self.weight = nn.Parameter(router_weight.clone())
+         self.bias = nn.Parameter(router_bias.clone())
+
+     def forward(self, hidden_states):
+         hidden_states = hidden_states.reshape(-1, self.hidden_dim)
+         router_logits = F.linear(hidden_states, self.weight, self.bias)
+         router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
+         router_top_value = F.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
+         router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
+         return router_scores, router_indices
+
+ def ceil_div(a, b):
+     return (a + b - 1) // b
+
+ class YamoeMoEMLP(nn.Module):
+     def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+         super().__init__()
+         self.router = YamoeRouter(router_weight, router_bias)
+         self.num_experts = NUM_EXPERTS
+         self.hidden_size = HIDDEN_SIZE
+         self.top_k = TOP_K
+
+         # Load Yamoe kernel
+         # self.yamoe = get_local_kernel(Path("/home/ubuntu/Projects/yamoe/result"), "yamoe")
+         self.yamoe = get_kernel("drbh/yamoe", revision="v0.2.0")
+
+         # Expert weights - use the loaded weights
+         self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
+         self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
+         self.down_proj = nn.Parameter(down_proj.clone())
+         self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
+
+     def forward(self, hidden_states):
+         batch_size, seq_len, hidden_dim = hidden_states.shape
+
+         # Get routing decisions
+         routing_weights, router_indices = self.router(hidden_states)
+
+         # Reshape for Yamoe kernel; capacity here is derived from batch_size, not token count
+         hidden_states_flat = hidden_states.view(-1, hidden_dim)
+         routing_weights_flat = routing_weights.view(-1, self.num_experts)
+         expert_capacity = ceil_div(batch_size * self.top_k, self.num_experts)
+
+         # Call Yamoe optimized kernel
+         output = self.yamoe.experts(
+             hidden_states_flat,
+             router_indices,
+             routing_weights_flat,
+             self.gate_up_proj,
+             self.gate_up_proj_bias,
+             self.down_proj,
+             self.down_proj_bias,
+             expert_capacity,
+             self.num_experts,
+             self.top_k,
+         )
+
+         # Reshape output back
+         output = output.view(batch_size, seq_len, hidden_dim)
+
+         return output, routing_weights
+
+ # Run the model
+ set_seed(GENERAL_SEED)
+
+ device = torch.device(DEVICE)
+ dtype = to_dtype(DTYPE)
+
+ print("\n=== Yamoe Implementation ===")
+ # Initialize model with loaded weights
+ model = YamoeMoEMLP(
+     router_weight.to(device),
+     router_bias.to(device),
+     gate_up_proj.to(device),
+     gate_up_proj_bias.to(device),
+     down_proj.to(device),
+     down_proj_bias.to(device)
+ ).to(device=device)
+
+ print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
+ print(f"Gate/up proj sum: {model.gate_up_proj.sum().item():.6f}")
+ print(f"Down proj sum: {model.down_proj.sum().item():.6f}")
+
+ # Generate input
+ set_seed(INPUT_SEED)
+ x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1
+
+ # Benchmark the model with varied inputs to prevent caching artifacts
+ tokens = BATCH_SIZE * SEQ_LEN
+ with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="yamoe_results.json", vary_inputs=True) as bench:
+     output, stats = bench(model, x)
+ print(f"\nOutput sum: {output[0].sum().item():.6f}")
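The run writes yamoe_results.json through bench_context(save_json=...); a minimal reader for the fields that bench_utils.py records:

import json

with open("yamoe_results.json") as f:
    results = json.load(f)

# Keys follow the json_data dict assembled in bench_utils.py.
print(results["implementation"])
print(f"avg: {results['stats']['avg_ms']:.3f} ms, "
      f"tokens/s: {results['stats'].get('tokens_per_s', float('nan')):.1f}")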
moe_benchmarks/megablocks_yamoe/index.html ADDED
@@ -0,0 +1,25 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /moe_benchmarks/megablocks_yamoe</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='megablocks_yamoe.html' class='file'>megablocks_yamoe.html</a></li>
+ <li><a href='torch_profile.html' class='file'>torch_profile.html</a></li>
+ </ul>
+ </body>
+ </html>
moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html ADDED
The diff for this file is too large to render. See raw diff
 
moe_benchmarks/megablocks_yamoe/torch_profile.html ADDED
The diff for this file is too large to render. See raw diff