drbh HF Staff committed on
Commit
6094336
·
verified ·
1 Parent(s): ed9a6af

Upload folder using huggingface_hub

Browse files
megablocks/megablocks_only.html CHANGED
The diff for this file is too large to render. See raw diff
 
megablocks_yamoe/artifacts/binned_run/binned_results.json CHANGED
@@ -9,16 +9,16 @@
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
- "avg_ms": 35.832872119995045,
13
- "min_ms": 32.58174399991276,
14
- "max_ms": 40.50060700001268,
15
- "std_ms": 1.694341573523051,
16
- "p50_ms": 36.17695449997882,
17
- "p95_ms": 38.67062735003515,
18
- "p99_ms": 39.92923416996405,
19
  "num_iters": 50,
20
- "tokens_per_s": 2790.733594145783,
21
- "throughput_variance": 131.29596945634063
22
  },
23
  "output_sum": 3.97190523147583
24
  }
 
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
+ "avg_ms": 36.21514258000616,
13
+ "min_ms": 33.172280000030696,
14
+ "max_ms": 38.75413800005845,
15
+ "std_ms": 1.401058294284512,
16
+ "p50_ms": 36.36444199997868,
17
+ "p95_ms": 38.060839599990004,
18
+ "p99_ms": 38.46422802999541,
19
  "num_iters": 50,
20
+ "tokens_per_s": 2761.275888368544,
21
+ "throughput_variance": 108.05444381816277
22
  },
23
  "output_sum": 3.97190523147583
24
  }
megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json CHANGED
@@ -9,16 +9,16 @@
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
- "avg_ms": 46.790802699997585,
13
- "min_ms": 39.03555299996242,
14
- "max_ms": 50.85692799991648,
15
- "std_ms": 3.250858562771192,
16
- "p50_ms": 47.475618500016026,
17
- "p95_ms": 50.805645549957035,
18
- "p99_ms": 50.83896361993766,
19
  "num_iters": 50,
20
- "tokens_per_s": 2137.172141310693,
21
- "throughput_variance": 155.17201487457513
22
  },
23
  "output_sum": 11.53223705291748
24
  }
 
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
+ "avg_ms": 45.94982444000152,
13
+ "min_ms": 40.76497799997014,
14
+ "max_ms": 52.299967999942965,
15
+ "std_ms": 3.623045351544196,
16
+ "p50_ms": 45.46925300002158,
17
+ "p95_ms": 51.35251775002985,
18
+ "p99_ms": 52.12179027996967,
19
  "num_iters": 50,
20
+ "tokens_per_s": 2176.286878540176,
21
+ "throughput_variance": 169.79505096491204
22
  },
23
  "output_sum": 11.53223705291748
24
  }
megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json CHANGED
@@ -9,16 +9,16 @@
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
- "avg_ms": 45.006849599990346,
13
- "min_ms": 38.83674200005771,
14
- "max_ms": 49.30821800007834,
15
- "std_ms": 2.893955494967115,
16
- "p50_ms": 45.57549300000119,
17
- "p95_ms": 48.57250854988706,
18
- "p99_ms": 48.963614720073565,
19
  "num_iters": 50,
20
- "tokens_per_s": 2221.8840218494533,
21
- "throughput_variance": 147.8630259637854
22
  },
23
  "output_sum": 11.53223705291748
24
  }
 
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
+ "avg_ms": 46.09780513999567,
13
+ "min_ms": 38.8389360000474,
14
+ "max_ms": 49.40391599996019,
15
+ "std_ms": 2.4686999934552376,
16
+ "p50_ms": 47.23983950003685,
17
+ "p95_ms": 48.725092950002136,
18
+ "p99_ms": 49.16830440000467,
19
  "num_iters": 50,
20
+ "tokens_per_s": 2169.300679203864,
21
+ "throughput_variance": 122.29861537972276
22
  },
23
  "output_sum": 11.53223705291748
24
  }
megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json CHANGED
@@ -9,16 +9,16 @@
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
- "avg_ms": 4.2496077999999216,
13
- "min_ms": 4.143714000065302,
14
- "max_ms": 4.276272000083736,
15
- "std_ms": 0.02026809704303406,
16
- "p50_ms": 4.251974999931463,
17
- "p95_ms": 4.269103000035557,
18
- "p99_ms": 4.276041210073345,
19
  "num_iters": 50,
20
- "tokens_per_s": 23531.58331458302,
21
- "throughput_variance": 113.86151920477748
22
  },
23
  "output_sum": 3.97190523147583
24
  }
 
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
+ "avg_ms": 4.247618279998733,
13
+ "min_ms": 4.12893800000802,
14
+ "max_ms": 4.265831999987313,
15
+ "std_ms": 0.020712896658640616,
16
+ "p50_ms": 4.251555999985612,
17
+ "p95_ms": 4.263803499975438,
18
+ "p99_ms": 4.2652827100027935,
19
  "num_iters": 50,
20
+ "tokens_per_s": 23542.605151428495,
21
+ "throughput_variance": 117.11531020813602
22
  },
23
  "output_sum": 3.97190523147583
24
  }
megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc CHANGED
Binary files a/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc and b/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc differ
 
megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc CHANGED
Binary files a/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc and b/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc differ
 
megablocks_yamoe/cells/megablocks_run.py CHANGED
@@ -56,7 +56,7 @@ def build_megablocks_model(device: torch.device):
56
  # Attach loaded expert weights to the experts container
57
  e = model.experts
58
  e.alpha = 1.702
59
- e.capacity_factor = 64
60
  e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device))
61
  e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device))
62
  e.down_proj = torch.nn.Parameter(down_proj.clone().to(device))
 
56
  # Attach loaded expert weights to the experts container
57
  e = model.experts
58
  e.alpha = 1.702
59
+ e.capacity_factor = 32
60
  e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device))
61
  e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device))
62
  e.down_proj = torch.nn.Parameter(down_proj.clone().to(device))
megablocks_yamoe/megablocks_yamoe.html CHANGED
@@ -3722,7 +3722,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3722
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3723
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3724
  </span> |
3725
- Cell: nv | 0.53s
3726
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3727
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3728
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3745,7 +3745,7 @@ Cell: nv | 0.53s
3745
  </div>
3746
  </div>
3747
  <div id="output-nv" class="cell-output">
3748
- <div class="cell-stdout">Wed Sep 24 21:05:30 2025
3749
  +-----------------------------------------------------------------------------------------+
3750
  | NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
3751
  |-----------------------------------------+------------------------+----------------------+
@@ -3754,19 +3754,19 @@ Cell: nv | 0.53s
3754
  | | | MIG M. |
3755
  |=========================================+========================+======================|
3756
  | 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
3757
- | 0% 38C P0 46W / 300W | 0MiB / 23028MiB | 0% Default |
3758
  | | | N/A |
3759
  +-----------------------------------------+------------------------+----------------------+
3760
  | 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
3761
- | 0% 37C P0 45W / 300W | 0MiB / 23028MiB | 0% Default |
3762
  | | | N/A |
3763
  +-----------------------------------------+------------------------+----------------------+
3764
  | 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
3765
- | 0% 39C P0 47W / 300W | 0MiB / 23028MiB | 0% Default |
3766
  | | | N/A |
3767
  +-----------------------------------------+------------------------+----------------------+
3768
  | 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
3769
- | 0% 38C P0 46W / 300W | 0MiB / 23028MiB | 0% Default |
3770
  | | | N/A |
3771
  +-----------------------------------------+------------------------+----------------------+
3772
 
@@ -3792,7 +3792,7 @@ Cell: nv | 0.53s
3792
  <span onclick="toggleOutput('setup2')" style="cursor: pointer;">▼ output</span>
3793
  <span id="uv-indicator-setup2" onclick="toggleUvLogsFromHeader('setup2')" style="cursor: pointer;">▶ uv-logs</span>
3794
  </span> |
3795
- Cell: setup2 | 113.64s
3796
  | <button class="run-btn" onclick="runCell('setup2')">▶ run</button>
3797
  <button class="copy-btn" onclick="copyCell('setup2')">Copy</button>
3798
  <a href="cells/setup2.py" target="_blank" class="raw-btn">Raw</a>
@@ -4050,7 +4050,7 @@ Reasoning: low
4050
  What is Tensor Parallelism?
4051
 
4052
  &lt;|end|&gt;&lt;|start|&gt;assistant&lt;|channel|&gt;analysis&lt;|message|&gt;We need to explain what Tensor Parallelism is. It&#x27;s a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it&#x27;s used in large models like GPT-3, Megatron-LM, DeepSpeed. Provide references. Also mention that it&#x27;s also called model parallelism. Provide explanation of how it works in practice: e.g., for a linear layer, weight matrix W of shape (out_features, in_features). In tensor parallelism, split W along out_features dimension across GPUs. Each GPU computes partial output. Then gather outputs. Provide details on how to handle bias, etc. Provide mention of &quot;tensor model parallelism&quot; vs &quot;tensor parallelism&quot; synonyms. Provide mention of &quot;tensor parallelism&quot; in Megatron-LM: splitting weight matrices across GPUs. Provide mention of &quot;tensor parallelism&quot; in DeepSpeed: &quot;ZeRO-Offload&quot; etc. Provide mention
4053
- Generation took 31.35 seconds
4054
  </div>
4055
  <div class="uv-install-logs" id="uv-logs-setup2">
4056
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
@@ -4059,31 +4059,31 @@ Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB)
4059
  Downloading cpython-3.13.7-linux-x86_64-gnu (download)
4060
  Updating https://github.com/huggingface/transformers.git (HEAD)
4061
  Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
4062
- Downloading sympy (6.0MiB)
 
4063
  Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
4064
- Downloading hf-xet (3.0MiB)
4065
- Downloading pillow (6.3MiB)
4066
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4067
  Downloading networkx (1.9MiB)
4068
- Downloading pygments (1.2MiB)
4069
- Downloading tokenizers (3.1MiB)
4070
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4071
  Downloading nvidia-cublas-cu12 (566.8MiB)
 
 
 
 
 
4072
  Downloading nvidia-cufile-cu12 (1.1MiB)
4073
- Downloading jedi (1.5MiB)
4074
- Downloading numpy (15.9MiB)
4075
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4076
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4077
- Downloading nvidia-curand-cu12 (60.7MiB)
4078
  Downloading triton (148.4MiB)
4079
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4080
- Downloading nvidia-nccl-cu12 (307.4MiB)
4081
  Downloading nvidia-cusolver-cu12 (255.1MiB)
4082
- Downloading nvidia-cufft-cu12 (184.2MiB)
4083
- Downloading matplotlib (8.3MiB)
4084
- Downloading fonttools (4.7MiB)
4085
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4086
  Downloading kiwisolver (1.4MiB)
 
 
 
 
 
 
 
4087
  Downloading torch (846.8MiB)
4088
  Downloading nvidia-cufile-cu12
4089
  Downloading kiwisolver
@@ -4105,38 +4105,36 @@ Downloading torch (846.8MiB)
4105
  Downloading triton
4106
  Downloading nvidia-cufft-cu12
4107
  Downloading nvidia-cusolver-cu12
4108
- Downloading nvidia-cusparse-cu12
4109
  Downloading nvidia-cusparselt-cu12
 
4110
  Downloading nvidia-nccl-cu12
4111
  Downloading nvidia-cublas-cu12
4112
  Downloading nvidia-cudnn-cu12
4113
  Downloading torch
4114
- Installed 69 packages in 550ms
4115
  </div>
4116
  </div>
4117
  <div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00&lt;?, ?it/s]
4118
- Fetching 3 files: 33%|███▎ | 1/3 [00:06&lt;00:12, 6.47s/it]
4119
- Fetching 3 files: 67%|██████▋ | 2/3 [00:07&lt;00:03, 3.37s/it]
4120
- Fetching 3 files: 100%|██████████| 3/3 [00:07&lt;00:00, 2.56s/it]
4121
  You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
4122
 
4123
  Loading checkpoint shards: 0%| | 0/3 [00:00&lt;?, ?it/s]
4124
- Loading checkpoint shards: 33%|███▎ | 1/3 [00:02&lt;00:04, 2.34s/it]
4125
  Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04&lt;00:02, 2.25s/it]
4126
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.80s/it]
4127
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.93s/it]
4128
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4129
 
4130
  Fetching 66 files: 0%| | 0/66 [00:00&lt;?, ?it/s]
4131
- Fetching 66 files: 2%|▏ | 1/66 [00:00&lt;00:16, 3.87it/s]
4132
- Fetching 66 files: 14%|█▎ | 9/66 [00:00&lt;00:03, 18.15it/s]
4133
- Fetching 66 files: 26%|██▌ | 17/66 [00:00&lt;00:02, 24.03it/s]
4134
- Fetching 66 files: 56%|█████▌ | 37/66 [00:00&lt;00:00, 58.06it/s]
4135
- Fetching 66 files: 71%|███████ | 47/66 [00:01&lt;00:00, 37.14it/s]
4136
- Fetching 66 files: 85%|████████▍ | 56/66 [00:01&lt;00:00, 39.66it/s]
4137
- Fetching 66 files: 98%|█████████▊| 65/66 [00:01&lt;00:00, 42.21it/s]
4138
- Fetching 66 files: 100%|██████████| 66/66 [00:01&lt;00:00, 37.62it/s]
4139
- /tmp/uvnote-run-e6cle3et/home/.cache/uv/environments-v2/setup2-adf2810b697d7b08/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
4140
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
4141
  warnings.warn(
4142
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
@@ -4163,7 +4161,7 @@ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks
4163
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4164
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4165
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4166
- /tmp/uvnote-run-e6cle3et/home/.cache/uv/environments-v2/setup2-adf2810b697d7b08/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
4167
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
4168
  warnings.warn(
4169
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
@@ -4200,7 +4198,7 @@ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks
4200
  <span onclick="toggleOutput('setup')" style="cursor: pointer;">▼ output</span>
4201
  <span id="uv-indicator-setup" onclick="toggleUvLogsFromHeader('setup')" style="cursor: pointer;">▶ uv-logs</span>
4202
  </span> |
4203
- Cell: setup | 108.22s
4204
  | <button class="run-btn" onclick="runCell('setup')">▶ run</button>
4205
  <button class="copy-btn" onclick="copyCell('setup')">Copy</button>
4206
  <a href="cells/setup.py" target="_blank" class="raw-btn">Raw</a>
@@ -4459,12 +4457,8 @@ Reasoning: low
4459
 
4460
  What is Tensor Parallelism?
4461
 
4462
- &lt;|end|&gt;&lt;|start|&gt;assistant&lt;|channel|&gt;analysis&lt;|message|&gt;We need to explain what Tensor Parallelism is. It&#x27;s a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of Megatron-LM, DeepSpeed, etc. Also mention that it&#x27;s used for very large models that don&#x27;t fit in a single GPU. Provide explanation of how it reduces memory usage and increases throughput. Also mention that it can be combined with other parallelism strategies. Provide mention of communication overhead, scaling, etc. Provide references. Provide code snippet maybe. Provide final answer.&lt;|end|&gt;&lt;|start|&gt;assistant&lt;|channel|&gt;final&lt;|message|&gt;## Tensor Parallelism A Quick Overview
4463
-
4464
- | **Aspect** | **What it is** | **Why it matters** |
4465
- |------------|----------------|--------------------|
4466
- | **Definition** | Splitting a *single* weight tensor (e.g., a large matrix in a transformer layer) across multiple devices so that each device holds only a *portion* of the tensor. | Allows training of models that are
4467
- Generation took 26.28 seconds
4468
  </div>
4469
  <div class="uv-install-logs" id="uv-logs-setup">
4470
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
@@ -4473,37 +4467,37 @@ Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB)
4473
  Downloading cpython-3.13.7-linux-x86_64-gnu (download)
4474
  Updating https://github.com/huggingface/transformers.git (HEAD)
4475
  Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
4476
- Downloading tokenizers (3.1MiB)
4477
- Downloading nvidia-cufile-cu12 (1.1MiB)
4478
- Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
4479
- Downloading nvidia-curand-cu12 (60.7MiB)
4480
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4481
- Downloading pygments (1.2MiB)
4482
- Downloading sympy (6.0MiB)
4483
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 
 
 
4484
  Downloading nvidia-nccl-cu12 (307.4MiB)
4485
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
4486
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
 
4487
  Downloading nvidia-cufft-cu12 (184.2MiB)
4488
- Downloading networkx (1.9MiB)
 
 
 
 
 
 
4489
  Downloading kiwisolver (1.4MiB)
4490
- Downloading pillow (6.3MiB)
 
 
4491
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
4492
  Downloading fonttools (4.7MiB)
4493
- Downloading hf-xet (3.0MiB)
4494
- Downloading numpy (15.9MiB)
4495
- Downloading matplotlib (8.3MiB)
4496
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4497
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4498
- Downloading jedi (1.5MiB)
4499
- Downloading nvidia-cublas-cu12 (566.8MiB)
4500
  Downloading triton (148.4MiB)
4501
  Downloading torch (846.8MiB)
4502
  Downloading nvidia-cufile-cu12
4503
  Downloading kiwisolver
4504
  Downloading pygments
4505
- Downloading tokenizers
4506
  Downloading hf-xet
 
4507
  Downloading networkx
4508
  Downloading fonttools
4509
  Downloading pillow
@@ -4519,33 +4513,33 @@ Downloading torch (846.8MiB)
4519
  Downloading triton
4520
  Downloading nvidia-cufft-cu12
4521
  Downloading nvidia-cusolver-cu12
4522
- Downloading nvidia-cusparse-cu12
4523
  Downloading nvidia-cusparselt-cu12
 
4524
  Downloading nvidia-nccl-cu12
4525
  Downloading nvidia-cublas-cu12
4526
  Downloading nvidia-cudnn-cu12
4527
  Downloading torch
4528
- Installed 69 packages in 462ms
4529
  </div>
4530
  </div>
4531
  <div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00&lt;?, ?it/s]
4532
- Fetching 3 files: 33%|███▎ | 1/3 [00:07&lt;00:14, 7.36s/it]
4533
- Fetching 3 files: 67%|██████▋ | 2/3 [00:08&lt;00:03, 3.69s/it]
4534
- Fetching 3 files: 100%|██████████| 3/3 [00:08&lt;00:00, 2.83s/it]
4535
  You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
4536
 
4537
  Loading checkpoint shards: 0%| | 0/3 [00:00&lt;?, ?it/s]
4538
- Loading checkpoint shards: 33%|███▎ | 1/3 [00:02&lt;00:04, 2.36s/it]
4539
- Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04&lt;00:02, 2.26s/it]
4540
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.80s/it]
4541
- Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.94s/it]
4542
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
4543
 
4544
  Fetching 6 files: 0%| | 0/6 [00:00&lt;?, ?it/s]
4545
- Fetching 6 files: 17%|█▋ | 1/6 [00:00&lt;00:01, 2.82it/s]
4546
- Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 11.61it/s]
4547
- Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 10.04it/s]
4548
- /tmp/uvnote-run-ga2bg_po/home/.cache/uv/environments-v2/setup-1400c3ff0fc01263/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
4549
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
4550
  warnings.warn(
4551
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
@@ -4572,7 +4566,7 @@ INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for laye
4572
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
4573
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
4574
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
4575
- /tmp/uvnote-run-ga2bg_po/home/.cache/uv/environments-v2/setup-1400c3ff0fc01263/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
4576
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
4577
  warnings.warn(
4578
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
 
3722
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3723
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3724
  </span> |
3725
+ Cell: nv | 0.55s
3726
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3727
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3728
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3745
  </div>
3746
  </div>
3747
  <div id="output-nv" class="cell-output">
3748
+ <div class="cell-stdout">Wed Sep 24 22:04:34 2025
3749
  +-----------------------------------------------------------------------------------------+
3750
  | NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
3751
  |-----------------------------------------+------------------------+----------------------+
 
3754
  | | | MIG M. |
3755
  |=========================================+========================+======================|
3756
  | 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
3757
+ | 0% 36C P0 45W / 300W | 0MiB / 23028MiB | 0% Default |
3758
  | | | N/A |
3759
  +-----------------------------------------+------------------------+----------------------+
3760
  | 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
3761
+ | 0% 37C P0 47W / 300W | 0MiB / 23028MiB | 0% Default |
3762
  | | | N/A |
3763
  +-----------------------------------------+------------------------+----------------------+
3764
  | 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
3765
+ | 0% 35C P0 47W / 300W | 0MiB / 23028MiB | 0% Default |
3766
  | | | N/A |
3767
  +-----------------------------------------+------------------------+----------------------+
3768
  | 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
3769
+ | 0% 37C P0 44W / 300W | 0MiB / 23028MiB | 0% Default |
3770
  | | | N/A |
3771
  +-----------------------------------------+------------------------+----------------------+
3772
 
 
3792
  <span onclick="toggleOutput('setup2')" style="cursor: pointer;">▼ output</span>
3793
  <span id="uv-indicator-setup2" onclick="toggleUvLogsFromHeader('setup2')" style="cursor: pointer;">▶ uv-logs</span>
3794
  </span> |
3795
+ Cell: setup2 | 114.03s
3796
  | <button class="run-btn" onclick="runCell('setup2')">▶ run</button>
3797
  <button class="copy-btn" onclick="copyCell('setup2')">Copy</button>
3798
  <a href="cells/setup2.py" target="_blank" class="raw-btn">Raw</a>
 
4050
  What is Tensor Parallelism?
4051
 
4052
  &lt;|end|&gt;&lt;|start|&gt;assistant&lt;|channel|&gt;analysis&lt;|message|&gt;We need to explain what Tensor Parallelism is. It&#x27;s a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it&#x27;s used in large models like GPT-3, Megatron-LM, DeepSpeed. Provide references. Also mention that it&#x27;s also called model parallelism. Provide explanation of how it works in practice: e.g., for a linear layer, weight matrix W of shape (out_features, in_features). In tensor parallelism, split W along out_features dimension across GPUs. Each GPU computes partial output. Then gather outputs. Provide details on how to handle bias, etc. Provide mention of &quot;tensor model parallelism&quot; vs &quot;tensor parallelism&quot; synonyms. Provide mention of &quot;tensor parallelism&quot; in Megatron-LM: splitting weight matrices across GPUs. Provide mention of &quot;tensor parallelism&quot; in DeepSpeed: &quot;ZeRO-Offload&quot; etc. Provide mention
4053
+ Generation took 31.36 seconds
4054
  </div>
4055
  <div class="uv-install-logs" id="uv-logs-setup2">
4056
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 
4059
  Downloading cpython-3.13.7-linux-x86_64-gnu (download)
4060
  Updating https://github.com/huggingface/transformers.git (HEAD)
4061
  Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
4062
+ Downloading jedi (1.5MiB)
4063
+ Downloading pygments (1.2MiB)
4064
  Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
4065
+ Downloading matplotlib (8.3MiB)
 
 
4066
  Downloading networkx (1.9MiB)
4067
+ Downloading sympy (6.0MiB)
 
 
4068
  Downloading nvidia-cublas-cu12 (566.8MiB)
4069
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4070
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4071
+ Downloading hf-xet (3.0MiB)
4072
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4073
+ Downloading fonttools (4.7MiB)
4074
  Downloading nvidia-cufile-cu12 (1.1MiB)
 
 
4075
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 
 
4076
  Downloading triton (148.4MiB)
 
 
4077
  Downloading nvidia-cusolver-cu12 (255.1MiB)
4078
+ Downloading tokenizers (3.1MiB)
 
 
 
4079
  Downloading kiwisolver (1.4MiB)
4080
+ Downloading nvidia-curand-cu12 (60.7MiB)
4081
+ Downloading pillow (6.3MiB)
4082
+ Downloading numpy (15.9MiB)
4083
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4084
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4085
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4086
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4087
  Downloading torch (846.8MiB)
4088
  Downloading nvidia-cufile-cu12
4089
  Downloading kiwisolver
 
4105
  Downloading triton
4106
  Downloading nvidia-cufft-cu12
4107
  Downloading nvidia-cusolver-cu12
 
4108
  Downloading nvidia-cusparselt-cu12
4109
+ Downloading nvidia-cusparse-cu12
4110
  Downloading nvidia-nccl-cu12
4111
  Downloading nvidia-cublas-cu12
4112
  Downloading nvidia-cudnn-cu12
4113
  Downloading torch
4114
+ Installed 69 packages in 509ms
4115
  </div>
4116
  </div>
4117
  <div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00&lt;?, ?it/s]
4118
+ Fetching 3 files: 33%|███▎ | 1/3 [00:06&lt;00:12, 6.49s/it]
4119
+ Fetching 3 files: 67%|██████▋ | 2/3 [00:07&lt;00:03, 3.44s/it]
4120
+ Fetching 3 files: 100%|██████████| 3/3 [00:07&lt;00:00, 2.60s/it]
4121
  You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
4122
 
4123
  Loading checkpoint shards: 0%| | 0/3 [00:00&lt;?, ?it/s]
4124
+ Loading checkpoint shards: 33%|███▎ | 1/3 [00:02&lt;00:04, 2.35s/it]
4125
  Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04&lt;00:02, 2.25s/it]
4126
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.80s/it]
4127
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.93s/it]
4128
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4129
 
4130
  Fetching 66 files: 0%| | 0/66 [00:00&lt;?, ?it/s]
4131
+ Fetching 66 files: 2%|▏ | 1/66 [00:00&lt;00:10, 6.31it/s]
4132
+ Fetching 66 files: 14%|█▎ | 9/66 [00:00&lt;00:02, 26.39it/s]
4133
+ Fetching 66 files: 26%|██▌ | 17/66 [00:01&lt;00:03, 12.42it/s]
4134
+ Fetching 66 files: 74%|███████▍ | 49/66 [00:01&lt;00:00, 45.00it/s]
4135
+ Fetching 66 files: 91%|█████████ | 60/66 [00:01&lt;00:00, 45.67it/s]
4136
+ Fetching 66 files: 100%|██████████| 66/66 [00:01&lt;00:00, 34.31it/s]
4137
+ /tmp/uvnote-run-_uergc47/home/.cache/uv/environments-v2/setup2-adf2810b697d7b08/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
 
 
4138
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
4139
  warnings.warn(
4140
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
 
4161
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4162
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4163
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4164
+ /tmp/uvnote-run-_uergc47/home/.cache/uv/environments-v2/setup2-adf2810b697d7b08/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
4165
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
4166
  warnings.warn(
4167
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
 
4198
  <span onclick="toggleOutput('setup')" style="cursor: pointer;">▼ output</span>
4199
  <span id="uv-indicator-setup" onclick="toggleUvLogsFromHeader('setup')" style="cursor: pointer;">▶ uv-logs</span>
4200
  </span> |
4201
+ Cell: setup | 109.23s
4202
  | <button class="run-btn" onclick="runCell('setup')">▶ run</button>
4203
  <button class="copy-btn" onclick="copyCell('setup')">Copy</button>
4204
  <a href="cells/setup.py" target="_blank" class="raw-btn">Raw</a>
 
4457
 
4458
  What is Tensor Parallelism?
4459
 
4460
+ &lt;|end|&gt;&lt;|start|&gt;assistant&lt;|channel|&gt;analysis&lt;|message|&gt;We need to explain what Tensor Parallelism is. It&#x27;s a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical use cases, differences from data parallelism, pipeline parallelism, model parallelism. Provide example: splitting a fully connected layer&#x27;s weight matrix across GPUs. Provide mention of frameworks: Megatron-LM, DeepSpeed, etc. Provide explanation of how forward/backward passes are computed. Provide mention of communication overhead, scaling, etc. Provide mention of &quot;tensor parallelism&quot; as part of &quot;model parallelism&quot; but specifically splitting tensors. Provide mention of &quot;tensor parallelism&quot; in context of transformer layers: splitting attention heads, feed-forward layers. Provide mention of &quot;tensor parallelism&quot; in context of &quot;DeepSpeed ZeRO Stage 3&quot; or &quot;Megatron-LM&#x27;s tensor parallelism&quot;. Provide mention of &quot;tensor parallelism&quot; as &quot;model parallelism across the weight matrices&quot; and &quot;tensor parallelism&quot; vs &quot;pipeline parallelism&quot;. Provide mention of &quot;tensor parallelism&quot; as &quot;splitting the weight matrix across GPUs, each GPU holds a slice of the matrix, and the input is broadcasted,
4461
+ Generation took 26.26 seconds
 
 
 
 
4462
  </div>
4463
  <div class="uv-install-logs" id="uv-logs-setup">
4464
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 
4467
  Downloading cpython-3.13.7-linux-x86_64-gnu (download)
4468
  Updating https://github.com/huggingface/transformers.git (HEAD)
4469
  Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
 
 
 
 
 
 
 
4470
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4471
+ Downloading pillow (6.3MiB)
4472
+ Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
4473
+ Downloading nvidia-cublas-cu12 (566.8MiB)
4474
  Downloading nvidia-nccl-cu12 (307.4MiB)
4475
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
4476
+ Downloading numpy (15.9MiB)
4477
+ Downloading hf-xet (3.0MiB)
4478
+ Downloading nvidia-curand-cu12 (60.7MiB)
4479
  Downloading nvidia-cufft-cu12 (184.2MiB)
4480
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4481
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4482
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4483
+ Downloading pygments (1.2MiB)
4484
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4485
+ Downloading jedi (1.5MiB)
4486
+ Downloading sympy (6.0MiB)
4487
  Downloading kiwisolver (1.4MiB)
4488
+ Downloading matplotlib (8.3MiB)
4489
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4490
+ Downloading networkx (1.9MiB)
4491
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4492
+ Downloading tokenizers (3.1MiB)
4493
  Downloading fonttools (4.7MiB)
 
 
 
 
 
 
 
4494
  Downloading triton (148.4MiB)
4495
  Downloading torch (846.8MiB)
4496
  Downloading nvidia-cufile-cu12
4497
  Downloading kiwisolver
4498
  Downloading pygments
 
4499
  Downloading hf-xet
4500
+ Downloading tokenizers
4501
  Downloading networkx
4502
  Downloading fonttools
4503
  Downloading pillow
 
4513
  Downloading triton
4514
  Downloading nvidia-cufft-cu12
4515
  Downloading nvidia-cusolver-cu12
 
4516
  Downloading nvidia-cusparselt-cu12
4517
+ Downloading nvidia-cusparse-cu12
4518
  Downloading nvidia-nccl-cu12
4519
  Downloading nvidia-cublas-cu12
4520
  Downloading nvidia-cudnn-cu12
4521
  Downloading torch
4522
+ Installed 69 packages in 464ms
4523
  </div>
4524
  </div>
4525
  <div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00&lt;?, ?it/s]
4526
+ Fetching 3 files: 33%|███▎ | 1/3 [00:07&lt;00:14, 7.38s/it]
4527
+ Fetching 3 files: 67%|██████▋ | 2/3 [00:08&lt;00:03, 3.64s/it]
4528
+ Fetching 3 files: 100%|██████████| 3/3 [00:08&lt;00:00, 2.80s/it]
4529
  You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
4530
 
4531
  Loading checkpoint shards: 0%| | 0/3 [00:00&lt;?, ?it/s]
4532
+ Loading checkpoint shards: 33%|███▎ | 1/3 [00:02&lt;00:04, 2.34s/it]
4533
+ Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04&lt;00:02, 2.25s/it]
4534
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.80s/it]
4535
+ Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.93s/it]
4536
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
4537
 
4538
  Fetching 6 files: 0%| | 0/6 [00:00&lt;?, ?it/s]
4539
+ Fetching 6 files: 17%|█▋ | 1/6 [00:00&lt;00:00, 5.44it/s]
4540
+ Fetching 6 files: 50%|█████ | 3/6 [00:00&lt;00:00, 6.96it/s]
4541
+ Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 13.54it/s]
4542
+ /tmp/uvnote-run-jc1wbhvj/home/.cache/uv/environments-v2/setup-1400c3ff0fc01263/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
4543
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
4544
  warnings.warn(
4545
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
 
4566
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
4567
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
4568
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
4569
+ /tmp/uvnote-run-jc1wbhvj/home/.cache/uv/environments-v2/setup-1400c3ff0fc01263/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
4570
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
4571
  warnings.warn(
4572
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
megablocks_yamoe/torch_profile.html CHANGED
@@ -3720,7 +3720,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3720
  <span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
3721
  <span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
3722
  </span> |
3723
- Cell: utils | deps: torch, numpy | 34.25s
3724
  | <button class="run-btn" onclick="runCell('utils')">▶ run</button>
3725
  <button class="copy-btn" onclick="copyCell('utils')">Copy</button>
3726
  <a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
@@ -3794,24 +3794,24 @@ Cell: utils | deps: torch, numpy | 34.25s
3794
  <div class="uv-install-logs" id="uv-logs-utils">
3795
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3796
  <div class="uv-logs-content" style="display: none;">
3797
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3798
  Downloading setuptools (1.1MiB)
3799
- Downloading nvidia-cufft-cu12 (184.2MiB)
3800
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3801
- Downloading nvidia-cufile-cu12 (1.1MiB)
3802
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3803
  Downloading numpy (16.2MiB)
3804
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3805
- Downloading nvidia-cublas-cu12 (566.8MiB)
3806
  Downloading nvidia-curand-cu12 (60.7MiB)
 
 
 
 
3807
  Downloading nvidia-nccl-cu12 (307.4MiB)
 
3808
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
3809
- Downloading sympy (6.0MiB)
3810
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
3811
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3812
- Downloading triton (148.3MiB)
3813
  Downloading torch (846.9MiB)
3814
- Downloading networkx (1.9MiB)
3815
  Downloading nvidia-cufile-cu12
3816
  Downloading setuptools
3817
  Downloading networkx
@@ -3824,13 +3824,13 @@ Downloading networkx (1.9MiB)
3824
  Downloading triton
3825
  Downloading nvidia-cufft-cu12
3826
  Downloading nvidia-cusolver-cu12
3827
- Downloading nvidia-cusparselt-cu12
3828
  Downloading nvidia-cusparse-cu12
 
3829
  Downloading nvidia-nccl-cu12
3830
  Downloading nvidia-cublas-cu12
3831
  Downloading nvidia-cudnn-cu12
3832
  Downloading torch
3833
- Installed 26 packages in 446ms
3834
  </div>
3835
  </div>
3836
  </div>
@@ -3843,7 +3843,7 @@ Installed 26 packages in 446ms
3843
  <span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
3844
  <span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
3845
  </span> |
3846
- Cell: bench_utils | deps: torch, numpy | 35.45s
3847
  | <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
3849
  <a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
@@ -4331,24 +4331,24 @@ Cell: bench_utils | deps: torch, numpy | 35.45s
4331
  <div class="uv-install-logs" id="uv-logs-bench_utils">
4332
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4333
  <div class="uv-logs-content" style="display: none;">
4334
- Downloading numpy (16.2MiB)
4335
- Downloading torch (846.9MiB)
4336
- Downloading triton (148.3MiB)
4337
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4338
- Downloading nvidia-curand-cu12 (60.7MiB)
4339
  Downloading setuptools (1.1MiB)
4340
- Downloading nvidia-cufft-cu12 (184.2MiB)
4341
- Downloading nvidia-cufile-cu12 (1.1MiB)
4342
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4343
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4344
- Downloading nvidia-nccl-cu12 (307.4MiB)
4345
  Downloading sympy (6.0MiB)
4346
- Downloading networkx (1.9MiB)
4347
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4348
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4349
  Downloading nvidia-cusparse-cu12 (274.9MiB)
4350
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
4351
- Downloading nvidia-cublas-cu12 (566.8MiB)
 
 
 
 
 
 
 
 
4352
  Downloading nvidia-cufile-cu12
4353
  Downloading setuptools
4354
  Downloading networkx
@@ -4367,7 +4367,7 @@ Downloading nvidia-cublas-cu12 (566.8MiB)
4367
  Downloading nvidia-cublas-cu12
4368
  Downloading nvidia-cudnn-cu12
4369
  Downloading torch
4370
- Installed 26 packages in 445ms
4371
  </div>
4372
  </div>
4373
  </div>
@@ -4381,7 +4381,7 @@ Installed 26 packages in 445ms
4381
  <span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
4382
  <span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
4383
  </span> |
4384
- Cell: config | deps: torch, numpy | 34.31s
4385
  | <button class="run-btn" onclick="runCell('config')">▶ run</button>
4386
  <button class="copy-btn" onclick="copyCell('config')">Copy</button>
4387
  <a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
@@ -4441,24 +4441,24 @@ Cell: config | deps: torch, numpy | 34.31s
4441
  <div class="uv-install-logs" id="uv-logs-config">
4442
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4443
  <div class="uv-logs-content" style="display: none;">
4444
- Downloading sympy (6.0MiB)
4445
- Downloading nvidia-cufile-cu12 (1.1MiB)
4446
- Downloading nvidia-nccl-cu12 (307.4MiB)
4447
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4448
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4449
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4450
- Downloading torch (846.9MiB)
4451
- Downloading networkx (1.9MiB)
4452
  Downloading nvidia-cufft-cu12 (184.2MiB)
 
 
 
4453
  Downloading setuptools (1.1MiB)
4454
- Downloading nvidia-curand-cu12 (60.7MiB)
4455
- Downloading nvidia-cublas-cu12 (566.8MiB)
 
 
 
4456
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
 
 
4457
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4458
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4459
- Downloading numpy (16.2MiB)
4460
- Downloading triton (148.3MiB)
4461
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4462
  Downloading nvidia-cufile-cu12
4463
  Downloading setuptools
4464
  Downloading networkx
@@ -4471,13 +4471,13 @@ Downloading nvidia-cusolver-cu12 (255.1MiB)
4471
  Downloading triton
4472
  Downloading nvidia-cufft-cu12
4473
  Downloading nvidia-cusolver-cu12
4474
- Downloading nvidia-cusparse-cu12
4475
  Downloading nvidia-cusparselt-cu12
 
4476
  Downloading nvidia-nccl-cu12
4477
  Downloading nvidia-cublas-cu12
4478
  Downloading nvidia-cudnn-cu12
4479
  Downloading torch
4480
- Installed 26 packages in 450ms
4481
  </div>
4482
  </div>
4483
  </div>
@@ -4490,7 +4490,7 @@ Installed 26 packages in 450ms
4490
  <span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
4491
  <span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
4492
  </span> |
4493
- Cell: save_data | deps: torch, numpy | 39.54s
4494
  | <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
4495
  <button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
4496
  <a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
@@ -4585,24 +4585,24 @@ Down sum: 206.729263
4585
  <div class="uv-install-logs" id="uv-logs-save_data">
4586
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4587
  <div class="uv-logs-content" style="display: none;">
 
 
 
4588
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
4589
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4590
- Downloading numpy (16.2MiB)
4591
  Downloading setuptools (1.1MiB)
4592
- Downloading nvidia-cublas-cu12 (566.8MiB)
4593
  Downloading nvidia-cudnn-cu12 (674.0MiB)
 
 
 
 
4594
  Downloading nvidia-curand-cu12 (60.7MiB)
4595
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4596
- Downloading nvidia-nccl-cu12 (307.4MiB)
4597
- Downloading nvidia-cufft-cu12 (184.2MiB)
4598
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4599
  Downloading nvidia-cufile-cu12 (1.1MiB)
4600
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4601
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4602
  Downloading sympy (6.0MiB)
4603
  Downloading torch (846.9MiB)
4604
- Downloading networkx (1.9MiB)
4605
- Downloading triton (148.3MiB)
 
4606
  Downloading nvidia-cufile-cu12
4607
  Downloading setuptools
4608
  Downloading networkx
@@ -4621,16 +4621,16 @@ Downloading triton (148.3MiB)
4621
  Downloading nvidia-cublas-cu12
4622
  Downloading nvidia-cudnn-cu12
4623
  Downloading torch
4624
- Installed 26 packages in 446ms
4625
  </div>
4626
  </div>
4627
  <div class="cell-artifacts">
4628
  <h4>Artifacts:</h4>
4629
- <a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
4630
  <a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
4631
  <a href="artifacts/save_data/down_proj.pt" class="artifact" target="_blank">down_proj.pt</a>
4632
  <a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
4633
  <a href="artifacts/save_data/router_bias.pt" class="artifact" target="_blank">router_bias.pt</a>
 
4634
  <a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
4635
  </div>
4636
  </div>
@@ -4645,7 +4645,7 @@ Installed 26 packages in 446ms
4645
  <span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
4646
  <span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
4647
  </span> |
4648
- Cell: yamoe_run | deps: torch, kernels, numpy | 39.10s
4649
  | <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
4650
  <button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
4651
  <a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
@@ -4938,10 +4938,10 @@ Input Variation: +0.001 * iteration (deterministic)
4938
 
4939
  Warming up (10 iterations)...
4940
  Benchmarking (50 iterations)...
4941
- Progress: 20% complete (avg: 4.251 ms)
4942
- Progress: 40% complete (avg: 4.248 ms)
4943
- Progress: 60% complete (avg: 4.248 ms)
4944
- Progress: 80% complete (avg: 4.249 ms)
4945
 
4946
  Output tensors:
4947
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
@@ -4951,19 +4951,19 @@ Output tensors:
4951
  Iterations: 50
4952
 
4953
  Latency Statistics:
4954
- Average: 4.250 ms
4955
- Min: 4.144 ms
4956
- Max: 4.276 ms
4957
- Std Dev: 0.020 ms
4958
 
4959
  Percentiles:
4960
  P50 (median): 4.252 ms
4961
- P95: 4.269 ms
4962
- P99: 4.276 ms
4963
 
4964
  Throughput:
4965
- Tokens/sec: 23531.6
4966
- Std Dev: 113.9
4967
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4968
 
4969
  Saved benchmark results to yamoe_results.json
@@ -4973,25 +4973,25 @@ Output sum: 3.971905
4973
  <div class="uv-install-logs" id="uv-logs-yamoe_run">
4974
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4975
  <div class="uv-logs-content" style="display: none;">
4976
- Downloading hf-xet (3.0MiB)
4977
- Downloading nvidia-nccl-cu12 (307.4MiB)
4978
  Downloading networkx (1.9MiB)
4979
- Downloading nvidia-cufft-cu12 (184.2MiB)
4980
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4981
  Downloading setuptools (1.1MiB)
4982
- Downloading nvidia-cudnn-cu12 (674.0MiB)
 
4983
  Downloading nvidia-cufile-cu12 (1.1MiB)
 
 
 
4984
  Downloading nvidia-cusolver-cu12 (255.1MiB)
4985
- Downloading numpy (16.2MiB)
 
 
4986
  Downloading torch (846.9MiB)
4987
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4988
- Downloading sympy (6.0MiB)
4989
  Downloading nvidia-curand-cu12 (60.7MiB)
4990
- Downloading nvidia-cublas-cu12 (566.8MiB)
4991
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4992
- Downloading triton (148.3MiB)
4993
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4994
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4995
  Downloading nvidia-cufile-cu12
4996
  Downloading hf-xet
4997
  Downloading setuptools
@@ -5011,13 +5011,14 @@ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
5011
  Downloading nvidia-cublas-cu12
5012
  Downloading nvidia-cudnn-cu12
5013
  Downloading torch
5014
- Installed 37 packages in 454ms
5015
  </div>
5016
  </div>
5017
  <div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00&lt;?, ?it/s]
5018
- Fetching 6 files: 17%|█▋ | 1/6 [00:00&lt;00:01, 3.47it/s]
5019
- Fetching 6 files: 50%|█████ | 3/6 [00:00&lt;00:00, 4.22it/s]
5020
- Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 8.26it/s]</div>
 
5021
  <div class="cell-artifacts">
5022
  <h4>Artifacts:</h4>
5023
  <a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
@@ -5034,7 +5035,7 @@ Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 8.2
5034
  <span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
5035
  <span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
5036
  </span> |
5037
- Cell: binned_run | deps: torch, numpy | 39.44s
5038
  | <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
5039
  <button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
5040
  <a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
@@ -5448,10 +5449,10 @@ Input Variation: +0.001 * iteration (deterministic)
5448
 
5449
  Warming up (10 iterations)...
5450
  Benchmarking (50 iterations)...
5451
- Progress: 20% complete (avg: 37.889 ms)
5452
- Progress: 40% complete (avg: 37.238 ms)
5453
- Progress: 60% complete (avg: 36.997 ms)
5454
- Progress: 80% complete (avg: 36.387 ms)
5455
 
5456
  Output tensors:
5457
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
@@ -5461,19 +5462,19 @@ Output tensors:
5461
  Iterations: 50
5462
 
5463
  Latency Statistics:
5464
- Average: 35.833 ms
5465
- Min: 32.582 ms
5466
- Max: 40.501 ms
5467
- Std Dev: 1.694 ms
5468
 
5469
  Percentiles:
5470
- P50 (median): 36.177 ms
5471
- P95: 38.671 ms
5472
- P99: 39.929 ms
5473
 
5474
  Throughput:
5475
- Tokens/sec: 2790.7
5476
- Std Dev: 131.3
5477
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
5478
 
5479
  Saved benchmark results to binned_results.json
@@ -5483,24 +5484,24 @@ Output sum: 3.971905
5483
  <div class="uv-install-logs" id="uv-logs-binned_run">
5484
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
5485
  <div class="uv-logs-content" style="display: none;">
5486
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
5487
- Downloading sympy (6.0MiB)
5488
- Downloading nvidia-cufile-cu12 (1.1MiB)
5489
- Downloading setuptools (1.1MiB)
5490
- Downloading numpy (16.2MiB)
5491
  Downloading networkx (1.9MiB)
5492
- Downloading nvidia-cudnn-cu12 (674.0MiB)
 
5493
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
5494
- Downloading nvidia-cublas-cu12 (566.8MiB)
5495
- Downloading nvidia-cusparse-cu12 (274.9MiB)
5496
- Downloading nvidia-cusolver-cu12 (255.1MiB)
5497
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
5498
  Downloading nvidia-cufft-cu12 (184.2MiB)
5499
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
5500
- Downloading nvidia-nccl-cu12 (307.4MiB)
5501
  Downloading nvidia-curand-cu12 (60.7MiB)
5502
- Downloading torch (846.9MiB)
 
 
5503
  Downloading triton (148.3MiB)
 
 
 
 
 
 
5504
  Downloading nvidia-cufile-cu12
5505
  Downloading setuptools
5506
  Downloading networkx
@@ -5513,13 +5514,13 @@ Downloading triton (148.3MiB)
5513
  Downloading triton
5514
  Downloading nvidia-cufft-cu12
5515
  Downloading nvidia-cusolver-cu12
5516
- Downloading nvidia-cusparselt-cu12
5517
  Downloading nvidia-cusparse-cu12
 
5518
  Downloading nvidia-nccl-cu12
5519
  Downloading nvidia-cublas-cu12
5520
  Downloading nvidia-cudnn-cu12
5521
  Downloading torch
5522
- Installed 26 packages in 446ms
5523
  </div>
5524
  </div>
5525
  <div class="cell-artifacts">
@@ -5538,7 +5539,7 @@ Installed 26 packages in 446ms
5538
  <span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
5539
  <span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
5540
  </span> |
5541
- Cell: gptoss_run | deps: torch, numpy | 40.46s
5542
  | <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
5543
  <button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
5544
  <a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
@@ -5856,10 +5857,10 @@ Input Variation: +0.001 * iteration (deterministic)
5856
 
5857
  Warming up (10 iterations)...
5858
  Benchmarking (50 iterations)...
5859
- Progress: 20% complete (avg: 50.504 ms)
5860
- Progress: 40% complete (avg: 50.045 ms)
5861
- Progress: 60% complete (avg: 49.107 ms)
5862
- Progress: 80% complete (avg: 48.012 ms)
5863
 
5864
  Output tensors:
5865
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
@@ -5869,19 +5870,19 @@ Output tensors:
5869
  Iterations: 50
5870
 
5871
  Latency Statistics:
5872
- Average: 46.791 ms
5873
- Min: 39.036 ms
5874
- Max: 50.857 ms
5875
- Std Dev: 3.251 ms
5876
 
5877
  Percentiles:
5878
- P50 (median): 47.476 ms
5879
- P95: 50.806 ms
5880
- P99: 50.839 ms
5881
 
5882
  Throughput:
5883
- Tokens/sec: 2137.2
5884
- Std Dev: 155.2
5885
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
5886
 
5887
  Saved benchmark results to gptoss_results.json
@@ -5891,23 +5892,23 @@ Output sum: 11.532237
5891
  <div class="uv-install-logs" id="uv-logs-gptoss_run">
5892
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
5893
  <div class="uv-logs-content" style="display: none;">
 
 
 
5894
  Downloading setuptools (1.1MiB)
5895
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
5896
  Downloading nvidia-curand-cu12 (60.7MiB)
5897
- Downloading numpy (16.2MiB)
5898
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
5899
- Downloading sympy (6.0MiB)
5900
- Downloading torch (846.9MiB)
5901
  Downloading nvidia-nccl-cu12 (307.4MiB)
 
5902
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
5903
- Downloading nvidia-cusolver-cu12 (255.1MiB)
5904
- Downloading nvidia-cudnn-cu12 (674.0MiB)
5905
- Downloading nvidia-cufft-cu12 (184.2MiB)
5906
- Downloading networkx (1.9MiB)
5907
  Downloading nvidia-cufile-cu12 (1.1MiB)
 
5908
  Downloading triton (148.3MiB)
5909
- Downloading nvidia-cusparse-cu12 (274.9MiB)
5910
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
 
5911
  Downloading nvidia-cublas-cu12 (566.8MiB)
5912
  Downloading nvidia-cufile-cu12
5913
  Downloading setuptools
@@ -5921,13 +5922,13 @@ Downloading nvidia-cublas-cu12 (566.8MiB)
5921
  Downloading triton
5922
  Downloading nvidia-cufft-cu12
5923
  Downloading nvidia-cusolver-cu12
5924
- Downloading nvidia-cusparselt-cu12
5925
  Downloading nvidia-cusparse-cu12
 
5926
  Downloading nvidia-nccl-cu12
5927
  Downloading nvidia-cublas-cu12
5928
  Downloading nvidia-cudnn-cu12
5929
  Downloading torch
5930
- Installed 26 packages in 442ms
5931
  </div>
5932
  </div>
5933
  <div class="cell-artifacts">
@@ -5946,7 +5947,7 @@ Installed 26 packages in 442ms
5946
  <span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
5947
  <span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
5948
  </span> |
5949
- Cell: gptoss_training_run | deps: torch, numpy | 39.65s
5950
  | <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
5951
  <button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
5952
  <a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
@@ -6247,10 +6248,10 @@ Input Variation: +0.001 * iteration (deterministic)
6247
 
6248
  Warming up (10 iterations)...
6249
  Benchmarking (50 iterations)...
6250
- Progress: 20% complete (avg: 48.334 ms)
6251
- Progress: 40% complete (avg: 47.917 ms)
6252
- Progress: 60% complete (avg: 47.077 ms)
6253
- Progress: 80% complete (avg: 46.038 ms)
6254
 
6255
  Output tensors:
6256
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
@@ -6260,19 +6261,19 @@ Output tensors:
6260
  Iterations: 50
6261
 
6262
  Latency Statistics:
6263
- Average: 45.007 ms
6264
- Min: 38.837 ms
6265
- Max: 49.308 ms
6266
- Std Dev: 2.894 ms
6267
 
6268
  Percentiles:
6269
- P50 (median): 45.575 ms
6270
- P95: 48.573 ms
6271
- P99: 48.964 ms
6272
 
6273
  Throughput:
6274
- Tokens/sec: 2221.9
6275
- Std Dev: 147.9
6276
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
6277
 
6278
  Saved benchmark results to gptoss_training_results.json
@@ -6282,24 +6283,24 @@ Output sum: 11.532237
6282
  <div class="uv-install-logs" id="uv-logs-gptoss_training_run">
6283
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
6284
  <div class="uv-logs-content" style="display: none;">
6285
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
6286
- Downloading networkx (1.9MiB)
6287
  Downloading setuptools (1.1MiB)
6288
- Downloading numpy (16.2MiB)
 
6289
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
6290
- Downloading nvidia-curand-cu12 (60.7MiB)
 
 
6291
  Downloading nvidia-cudnn-cu12 (674.0MiB)
6292
- Downloading sympy (6.0MiB)
 
 
6293
  Downloading nvidia-cusolver-cu12 (255.1MiB)
6294
- Downloading nvidia-nccl-cu12 (307.4MiB)
6295
- Downloading nvidia-cusparse-cu12 (274.9MiB)
6296
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
6297
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
 
6298
  Downloading nvidia-cufft-cu12 (184.2MiB)
6299
- Downloading nvidia-cufile-cu12 (1.1MiB)
6300
- Downloading nvidia-cublas-cu12 (566.8MiB)
6301
  Downloading triton (148.3MiB)
6302
- Downloading torch (846.9MiB)
6303
  Downloading nvidia-cufile-cu12
6304
  Downloading setuptools
6305
  Downloading networkx
@@ -6318,7 +6319,7 @@ Downloading torch (846.9MiB)
6318
  Downloading nvidia-cublas-cu12
6319
  Downloading nvidia-cudnn-cu12
6320
  Downloading torch
6321
- Installed 26 packages in 448ms
6322
  </div>
6323
  </div>
6324
  <div class="cell-artifacts">
@@ -6337,7 +6338,7 @@ Installed 26 packages in 448ms
6337
  <span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
6338
  <span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
6339
  </span> |
6340
- Cell: megablocks_run | deps: torch, numpy, kernels | 41.38s | FAILED
6341
  | <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
6342
  <button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
6343
  <a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
@@ -6492,7 +6493,7 @@ Cell: megablocks_run | deps: torch, numpy, kernels | 41.38s | FAILED
6492
  <span class="c1"># Attach loaded expert weights to the experts container</span>
6493
  <span class="n">e</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">experts</span>
6494
  <span class="n">e</span><span class="o">.</span><span class="n">alpha</span> <span class="o">=</span> <span class="mf">1.702</span>
6495
- <span class="n">e</span><span class="o">.</span><span class="n">capacity_factor</span> <span class="o">=</span> <span class="mi">64</span>
6496
  <span class="n">e</span><span class="o">.</span><span class="n">gate_up_proj</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">gate_up_proj</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
6497
  <span class="n">e</span><span class="o">.</span><span class="n">gate_up_proj_bias</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">gate_up_proj_bias</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
6498
  <span class="n">e</span><span class="o">.</span><span class="n">down_proj</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">down_proj</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
@@ -6569,25 +6570,25 @@ Warming up (10 iterations)...
6569
  <div class="uv-install-logs" id="uv-logs-megablocks_run">
6570
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
6571
  <div class="uv-logs-content" style="display: none;">
6572
- Downloading nvidia-cublas-cu12 (566.8MiB)
6573
- Downloading setuptools (1.1MiB)
6574
  Downloading numpy (16.2MiB)
6575
- Downloading networkx (1.9MiB)
6576
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
6577
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
6578
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
6579
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
6580
  Downloading nvidia-cufile-cu12 (1.1MiB)
6581
- Downloading torch (846.9MiB)
6582
- Downloading nvidia-nccl-cu12 (307.4MiB)
6583
- Downloading nvidia-cufft-cu12 (184.2MiB)
6584
  Downloading hf-xet (3.0MiB)
6585
- Downloading nvidia-curand-cu12 (60.7MiB)
 
6586
  Downloading nvidia-cusparse-cu12 (274.9MiB)
6587
- Downloading nvidia-cusolver-cu12 (255.1MiB)
6588
- Downloading triton (148.3MiB)
6589
  Downloading nvidia-cudnn-cu12 (674.0MiB)
 
 
 
 
6590
  Downloading sympy (6.0MiB)
 
 
 
 
 
6591
  Downloading nvidia-cufile-cu12
6592
  Downloading hf-xet
6593
  Downloading setuptools
@@ -6601,26 +6602,25 @@ Downloading sympy (6.0MiB)
6601
  Downloading triton
6602
  Downloading nvidia-cufft-cu12
6603
  Downloading nvidia-cusolver-cu12
6604
- Downloading nvidia-cusparselt-cu12
6605
  Downloading nvidia-cusparse-cu12
 
6606
  Downloading nvidia-nccl-cu12
6607
  Downloading nvidia-cublas-cu12
6608
  Downloading nvidia-cudnn-cu12
6609
  Downloading torch
6610
- Installed 37 packages in 543ms
6611
  </div>
6612
  </div>
6613
  <div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00&lt;?, ?it/s]
6614
- Fetching 66 files: 2%|▏ | 1/66 [00:00&lt;00:27, 2.39it/s]
6615
- Fetching 66 files: 6%|▌ | 4/66 [00:00&lt;00:07, 8.04it/s]
6616
- Fetching 66 files: 17%|█▋ | 11/66 [00:00&lt;00:02, 21.45it/s]
6617
- Fetching 66 files: 26%|██▌ | 17/66 [00:01&lt;00:02, 17.15it/s]
6618
- Fetching 66 files: 48%|████▊ | 32/66 [00:01&lt;00:01, 30.72it/s]
6619
- Fetching 66 files: 62%|██████▏ | 41/66 [00:01&lt;00:01, 23.83it/s]
6620
- Fetching 66 files: 71%|███████ | 47/66 [00:02&lt;00:00, 25.88it/s]
6621
- Fetching 66 files: 100%|██████████| 66/66 [00:02&lt;00:00, 45.13it/s]
6622
- Fetching 66 files: 100%|██████████| 66/66 [00:02&lt;00:00, 29.34it/s]
6623
- /tmp/tmpq5pei8xr/cuda_utils.c:5:10: fatal error: Python.h: No such file or directory
6624
  5 | #include &lt;Python.h&gt;
6625
  | ^~~~~~~~~~
6626
  compilation terminated.
@@ -6637,87 +6637,87 @@ Traceback (most recent call last):
6637
  File &quot;/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/bench_utils.py&quot;, line 177, in &lt;lambda&gt;
6638
  call = lambda x: fn(x, *args[1:], **kwargs)
6639
  ^^^^^^^^^^^^^^^^^^^^^^^^^^
6640
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py&quot;, line 1773, in _wrapped_call_impl
6641
  return self._call_impl(*args, **kwargs)
6642
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6643
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py&quot;, line 1784, in _call_impl
6644
  return forward_call(*args, **kwargs)
6645
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6646
  File &quot;/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/megablocks_run.py&quot;, line 81, in forward
6647
  output, dummy_routing_weights = self.model(hidden_states)
6648
  ^^^^^^^^^^^^^^^^^^^^^^^^^
6649
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py&quot;, line 1773, in _wrapped_call_impl
6650
  return self._call_impl(*args, **kwargs)
6651
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6652
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py&quot;, line 1784, in _call_impl
6653
  return forward_call(*args, **kwargs)
6654
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6655
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py&quot;, line 896, in forward
6656
  output, expert_weights_out, *_ = moe_forward(
6657
  ^^^^^^^^^^^^
6658
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py&quot;, line 730, in moe_forward
6659
  x, tokens_per_expert = forward_fn(**forward_args)
6660
  ^^^^^^^^^^^^^^^^^^^^^^^^^^
6661
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py&quot;, line 457, in forward_once
6662
  x = permute_and_compute(
6663
  ^^^^^^^^^^^^^^^^^^^^
6664
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py&quot;, line 401, in permute_and_compute
6665
  x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
6666
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6667
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/autograd/function.py&quot;, line 576, in apply
6668
  return super().apply(*args, **kwargs) # type: ignore[misc]
6669
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6670
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/ops/stk_autocast.py&quot;, line 30, in decorate_fwd
6671
  return fwd(*args, **kwargs)
6672
  ^^^^^^^^^^^^^^^^^^^^
6673
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/ops/binned_gather.py&quot;, line 26, in forward
6674
  return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
6675
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6676
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/backend/kernels.py&quot;, line 419, in binned_gather
6677
  _binned_copy[(num_experts, expert_capacity)](
6678
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/jit.py&quot;, line 390, in &lt;lambda&gt;
6679
  return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
6680
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6681
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 239, in run
6682
  benchmark()
6683
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 228, in benchmark
6684
  timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
6685
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6686
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 228, in &lt;dictcomp&gt;
6687
  timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
6688
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6689
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 160, in _bench
6690
  return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))
6691
  ^^^^^^^^^^^^^
6692
  File &quot;/usr/lib/python3.11/functools.py&quot;, line 1001, in __get__
6693
  val = self.func(instance)
6694
  ^^^^^^^^^^^^^^^^^^^
6695
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 121, in do_bench
6696
  return driver.active.get_benchmarker()
6697
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6698
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py&quot;, line 30, in __getattr__
6699
  return getattr(self._initialize_obj(), name)
6700
  ^^^^^^^^^^^^^^^^^^^^^^
6701
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py&quot;, line 26, in _initialize_obj
6702
  self._obj = self._init_fn()
6703
  ^^^^^^^^^^^^^^^
6704
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py&quot;, line 12, in _create_driver
6705
  return active_drivers[0]()
6706
  ^^^^^^^^^^^^^^^^^^^
6707
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/driver.py&quot;, line 715, in __init__
6708
  self.utils = CudaUtils() # TODO: make static
6709
  ^^^^^^^^^^^
6710
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/driver.py&quot;, line 62, in __init__
6711
  mod = compile_module_from_src(
6712
  ^^^^^^^^^^^^^^^^^^^^^^^^
6713
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/build.py&quot;, line 88, in compile_module_from_src
6714
  so = _build(name, src_path, tmpdir, library_dirs or [], include_dirs or [], libraries or [])
6715
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6716
- File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/build.py&quot;, line 51, in _build
6717
  subprocess.check_call(cc_cmd, stdout=subprocess.DEVNULL)
6718
  File &quot;/usr/lib/python3.11/subprocess.py&quot;, line 413, in check_call
6719
  raise CalledProcessError(retcode, cmd)
6720
- subprocess.CalledProcessError: Command &#x27;[&#x27;/usr/bin/gcc&#x27;, &#x27;/tmp/tmpq5pei8xr/cuda_utils.c&#x27;, &#x27;-O3&#x27;, &#x27;-shared&#x27;, &#x27;-fPIC&#x27;, &#x27;-Wno-psabi&#x27;, &#x27;-o&#x27;, &#x27;/tmp/tmpq5pei8xr/cuda_utils.cpython-311-x86_64-linux-gnu.so&#x27;, &#x27;-lcuda&#x27;, &#x27;-L/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/lib&#x27;, &#x27;-L/usr/lib/x86_64-linux-gnu&#x27;, &#x27;-I/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/include&#x27;, &#x27;-I/tmp/tmpq5pei8xr&#x27;, &#x27;-I/usr/include/python3.11&#x27;]&#x27; returned non-zero exit status 1.</div>
6721
  </div>
6722
  </div>
6723
 
 
3720
  <span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
3721
  <span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
3722
  </span> |
3723
+ Cell: utils | deps: torch, numpy | 35.29s
3724
  | <button class="run-btn" onclick="runCell('utils')">▶ run</button>
3725
  <button class="copy-btn" onclick="copyCell('utils')">Copy</button>
3726
  <a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
 
3794
  <div class="uv-install-logs" id="uv-logs-utils">
3795
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3796
  <div class="uv-logs-content" style="display: none;">
3797
+ Downloading networkx (1.9MiB)
3798
  Downloading setuptools (1.1MiB)
 
 
 
 
3799
  Downloading numpy (16.2MiB)
3800
+ Downloading sympy (6.0MiB)
 
3801
  Downloading nvidia-curand-cu12 (60.7MiB)
3802
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3803
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
3804
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
3805
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
3806
  Downloading nvidia-nccl-cu12 (307.4MiB)
3807
+ Downloading nvidia-cufile-cu12 (1.1MiB)
3808
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
3809
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3810
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3811
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3812
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
3813
  Downloading torch (846.9MiB)
3814
+ Downloading triton (148.3MiB)
3815
  Downloading nvidia-cufile-cu12
3816
  Downloading setuptools
3817
  Downloading networkx
 
3824
  Downloading triton
3825
  Downloading nvidia-cufft-cu12
3826
  Downloading nvidia-cusolver-cu12
 
3827
  Downloading nvidia-cusparse-cu12
3828
+ Downloading nvidia-cusparselt-cu12
3829
  Downloading nvidia-nccl-cu12
3830
  Downloading nvidia-cublas-cu12
3831
  Downloading nvidia-cudnn-cu12
3832
  Downloading torch
3833
+ Installed 26 packages in 455ms
3834
  </div>
3835
  </div>
3836
  </div>
 
3843
  <span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
3844
  <span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
3845
  </span> |
3846
+ Cell: bench_utils | deps: torch, numpy | 34.44s
3847
  | <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
3849
  <a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
 
4331
  <div class="uv-install-logs" id="uv-logs-bench_utils">
4332
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4333
  <div class="uv-logs-content" style="display: none;">
 
 
 
 
 
4334
  Downloading setuptools (1.1MiB)
4335
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4336
+ Downloading nvidia-cublas-cu12 (566.8MiB)
4337
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4338
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4339
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4340
  Downloading sympy (6.0MiB)
 
 
 
4341
  Downloading nvidia-cusparse-cu12 (274.9MiB)
4342
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
4343
+ Downloading triton (148.3MiB)
4344
+ Downloading nvidia-curand-cu12 (60.7MiB)
4345
+ Downloading torch (846.9MiB)
4346
+ Downloading networkx (1.9MiB)
4347
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4348
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4349
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4350
+ Downloading numpy (16.2MiB)
4351
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4352
  Downloading nvidia-cufile-cu12
4353
  Downloading setuptools
4354
  Downloading networkx
 
4367
  Downloading nvidia-cublas-cu12
4368
  Downloading nvidia-cudnn-cu12
4369
  Downloading torch
4370
+ Installed 26 packages in 447ms
4371
  </div>
4372
  </div>
4373
  </div>
 
4381
  <span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
4382
  <span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
4383
  </span> |
4384
+ Cell: config | deps: torch, numpy | 34.69s
4385
  | <button class="run-btn" onclick="runCell('config')">▶ run</button>
4386
  <button class="copy-btn" onclick="copyCell('config')">Copy</button>
4387
  <a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
 
4441
  <div class="uv-install-logs" id="uv-logs-config">
4442
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4443
  <div class="uv-logs-content" style="display: none;">
4444
+ Downloading numpy (16.2MiB)
 
 
 
 
 
 
 
4445
  Downloading nvidia-cufft-cu12 (184.2MiB)
4446
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4447
+ Downloading torch (846.9MiB)
4448
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4449
  Downloading setuptools (1.1MiB)
4450
+ Downloading triton (148.3MiB)
4451
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4452
+ Downloading networkx (1.9MiB)
4453
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4454
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4455
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4456
+ Downloading sympy (6.0MiB)
4457
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4458
+ Downloading nvidia-curand-cu12 (60.7MiB)
4459
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4460
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4461
+ Downloading nvidia-cublas-cu12 (566.8MiB)
 
 
4462
  Downloading nvidia-cufile-cu12
4463
  Downloading setuptools
4464
  Downloading networkx
 
4471
  Downloading triton
4472
  Downloading nvidia-cufft-cu12
4473
  Downloading nvidia-cusolver-cu12
 
4474
  Downloading nvidia-cusparselt-cu12
4475
+ Downloading nvidia-cusparse-cu12
4476
  Downloading nvidia-nccl-cu12
4477
  Downloading nvidia-cublas-cu12
4478
  Downloading nvidia-cudnn-cu12
4479
  Downloading torch
4480
+ Installed 26 packages in 526ms
4481
  </div>
4482
  </div>
4483
  </div>
 
4490
  <span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
4491
  <span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
4492
  </span> |
4493
+ Cell: save_data | deps: torch, numpy | 40.40s
4494
  | <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
4495
  <button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
4496
  <a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
 
4585
  <div class="uv-install-logs" id="uv-logs-save_data">
4586
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4587
  <div class="uv-logs-content" style="display: none;">
4588
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4589
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4590
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4591
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
 
4592
  Downloading setuptools (1.1MiB)
 
4593
  Downloading nvidia-cudnn-cu12 (674.0MiB)
4594
+ Downloading numpy (16.2MiB)
4595
+ Downloading triton (148.3MiB)
4596
+ Downloading networkx (1.9MiB)
4597
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4598
  Downloading nvidia-curand-cu12 (60.7MiB)
 
 
 
 
4599
  Downloading nvidia-cufile-cu12 (1.1MiB)
4600
+ Downloading nvidia-cufft-cu12 (184.2MiB)
 
4601
  Downloading sympy (6.0MiB)
4602
  Downloading torch (846.9MiB)
4603
+ Downloading nvidia-cublas-cu12 (566.8MiB)
4604
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4605
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4606
  Downloading nvidia-cufile-cu12
4607
  Downloading setuptools
4608
  Downloading networkx
 
4621
  Downloading nvidia-cublas-cu12
4622
  Downloading nvidia-cudnn-cu12
4623
  Downloading torch
4624
+ Installed 26 packages in 563ms
4625
  </div>
4626
  </div>
4627
  <div class="cell-artifacts">
4628
  <h4>Artifacts:</h4>
 
4629
  <a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
4630
  <a href="artifacts/save_data/down_proj.pt" class="artifact" target="_blank">down_proj.pt</a>
4631
  <a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
4632
  <a href="artifacts/save_data/router_bias.pt" class="artifact" target="_blank">router_bias.pt</a>
4633
+ <a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
4634
  <a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
4635
  </div>
4636
  </div>
 
4645
  <span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
4646
  <span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
4647
  </span> |
4648
+ Cell: yamoe_run | deps: torch, kernels, numpy | 38.77s
4649
  | <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
4650
  <button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
4651
  <a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
 
4938
 
4939
  Warming up (10 iterations)...
4940
  Benchmarking (50 iterations)...
4941
+ Progress: 20% complete (avg: 4.248 ms)
4942
+ Progress: 40% complete (avg: 4.246 ms)
4943
+ Progress: 60% complete (avg: 4.247 ms)
4944
+ Progress: 80% complete (avg: 4.247 ms)
4945
 
4946
  Output tensors:
4947
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
 
4951
  Iterations: 50
4952
 
4953
  Latency Statistics:
4954
+ Average: 4.248 ms
4955
+ Min: 4.129 ms
4956
+ Max: 4.266 ms
4957
+ Std Dev: 0.021 ms
4958
 
4959
  Percentiles:
4960
  P50 (median): 4.252 ms
4961
+ P95: 4.264 ms
4962
+ P99: 4.265 ms
4963
 
4964
  Throughput:
4965
+ Tokens/sec: 23542.6
4966
+ Std Dev: 117.1
4967
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4968
 
4969
  Saved benchmark results to yamoe_results.json
 
4973
  <div class="uv-install-logs" id="uv-logs-yamoe_run">
4974
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4975
  <div class="uv-logs-content" style="display: none;">
 
 
4976
  Downloading networkx (1.9MiB)
4977
+ Downloading sympy (6.0MiB)
 
4978
  Downloading setuptools (1.1MiB)
4979
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4980
+ Downloading hf-xet (3.0MiB)
4981
  Downloading nvidia-cufile-cu12 (1.1MiB)
4982
+ Downloading triton (148.3MiB)
4983
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4984
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4985
  Downloading nvidia-cusolver-cu12 (255.1MiB)
4986
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4987
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4988
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4989
  Downloading torch (846.9MiB)
 
 
4990
  Downloading nvidia-curand-cu12 (60.7MiB)
 
4991
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4992
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4993
+ Downloading nvidia-cublas-cu12 (566.8MiB)
4994
+ Downloading numpy (16.2MiB)
4995
  Downloading nvidia-cufile-cu12
4996
  Downloading hf-xet
4997
  Downloading setuptools
 
5011
  Downloading nvidia-cublas-cu12
5012
  Downloading nvidia-cudnn-cu12
5013
  Downloading torch
5014
+ Installed 37 packages in 449ms
5015
  </div>
5016
  </div>
5017
  <div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00&lt;?, ?it/s]
5018
+ Fetching 6 files: 17%|█▋ | 1/6 [00:00&lt;00:00, 5.90it/s]
5019
+ Fetching 6 files: 33%|███▎ | 2/6 [00:00&lt;00:00, 7.70it/s]
5020
+ Fetching 6 files: 50%|█████ | 3/6 [00:00&lt;00:00, 4.70it/s]
5021
+ Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 10.28it/s]</div>
5022
  <div class="cell-artifacts">
5023
  <h4>Artifacts:</h4>
5024
  <a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
 
5035
  <span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
5036
  <span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
5037
  </span> |
5038
+ Cell: binned_run | deps: torch, numpy | 38.76s
5039
  | <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
5040
  <button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
5041
  <a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
 
5449
 
5450
  Warming up (10 iterations)...
5451
  Benchmarking (50 iterations)...
5452
+ Progress: 20% complete (avg: 37.794 ms)
5453
+ Progress: 40% complete (avg: 37.656 ms)
5454
+ Progress: 60% complete (avg: 37.188 ms)
5455
+ Progress: 80% complete (avg: 36.704 ms)
5456
 
5457
  Output tensors:
5458
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
 
5462
  Iterations: 50
5463
 
5464
  Latency Statistics:
5465
+ Average: 36.215 ms
5466
+ Min: 33.172 ms
5467
+ Max: 38.754 ms
5468
+ Std Dev: 1.401 ms
5469
 
5470
  Percentiles:
5471
+ P50 (median): 36.364 ms
5472
+ P95: 38.061 ms
5473
+ P99: 38.464 ms
5474
 
5475
  Throughput:
5476
+ Tokens/sec: 2761.3
5477
+ Std Dev: 108.1
5478
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
5479
 
5480
  Saved benchmark results to binned_results.json
 
5484
  <div class="uv-install-logs" id="uv-logs-binned_run">
5485
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
5486
  <div class="uv-logs-content" style="display: none;">
 
 
 
 
 
5487
  Downloading networkx (1.9MiB)
5488
+ Downloading numpy (16.2MiB)
5489
+ Downloading setuptools (1.1MiB)
5490
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
 
 
 
5491
  Downloading nvidia-cufft-cu12 (184.2MiB)
5492
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
5493
+ Downloading nvidia-cufile-cu12 (1.1MiB)
5494
  Downloading nvidia-curand-cu12 (60.7MiB)
5495
+ Downloading nvidia-nccl-cu12 (307.4MiB)
5496
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
5497
+ Downloading nvidia-cublas-cu12 (566.8MiB)
5498
  Downloading triton (148.3MiB)
5499
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
5500
+ Downloading torch (846.9MiB)
5501
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
5502
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
5503
+ Downloading sympy (6.0MiB)
5504
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
5505
  Downloading nvidia-cufile-cu12
5506
  Downloading setuptools
5507
  Downloading networkx
 
5514
  Downloading triton
5515
  Downloading nvidia-cufft-cu12
5516
  Downloading nvidia-cusolver-cu12
 
5517
  Downloading nvidia-cusparse-cu12
5518
+ Downloading nvidia-cusparselt-cu12
5519
  Downloading nvidia-nccl-cu12
5520
  Downloading nvidia-cublas-cu12
5521
  Downloading nvidia-cudnn-cu12
5522
  Downloading torch
5523
+ Installed 26 packages in 455ms
5524
  </div>
5525
  </div>
5526
  <div class="cell-artifacts">
 
5539
  <span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
5540
  <span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
5541
  </span> |
5542
+ Cell: gptoss_run | deps: torch, numpy | 39.76s
5543
  | <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
5544
  <button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
5545
  <a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
 
5857
 
5858
  Warming up (10 iterations)...
5859
  Benchmarking (50 iterations)...
5860
+ Progress: 20% complete (avg: 51.012 ms)
5861
+ Progress: 40% complete (avg: 49.954 ms)
5862
+ Progress: 60% complete (avg: 48.390 ms)
5863
+ Progress: 80% complete (avg: 46.993 ms)
5864
 
5865
  Output tensors:
5866
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
 
5870
  Iterations: 50
5871
 
5872
  Latency Statistics:
5873
+ Average: 45.950 ms
5874
+ Min: 40.765 ms
5875
+ Max: 52.300 ms
5876
+ Std Dev: 3.623 ms
5877
 
5878
  Percentiles:
5879
+ P50 (median): 45.469 ms
5880
+ P95: 51.353 ms
5881
+ P99: 52.122 ms
5882
 
5883
  Throughput:
5884
+ Tokens/sec: 2176.3
5885
+ Std Dev: 169.8
5886
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
5887
 
5888
  Saved benchmark results to gptoss_results.json
 
5892
  <div class="uv-install-logs" id="uv-logs-gptoss_run">
5893
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
5894
  <div class="uv-logs-content" style="display: none;">
5895
+ Downloading numpy (16.2MiB)
5896
+ Downloading networkx (1.9MiB)
5897
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
5898
  Downloading setuptools (1.1MiB)
5899
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
5900
  Downloading nvidia-curand-cu12 (60.7MiB)
 
5901
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 
 
5902
  Downloading nvidia-nccl-cu12 (307.4MiB)
5903
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
5904
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
 
 
 
5905
  Downloading nvidia-cufile-cu12 (1.1MiB)
5906
+ Downloading nvidia-cufft-cu12 (184.2MiB)
5907
  Downloading triton (148.3MiB)
5908
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
5909
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
5910
+ Downloading torch (846.9MiB)
5911
+ Downloading sympy (6.0MiB)
5912
  Downloading nvidia-cublas-cu12 (566.8MiB)
5913
  Downloading nvidia-cufile-cu12
5914
  Downloading setuptools
 
5922
  Downloading triton
5923
  Downloading nvidia-cufft-cu12
5924
  Downloading nvidia-cusolver-cu12
 
5925
  Downloading nvidia-cusparse-cu12
5926
+ Downloading nvidia-cusparselt-cu12
5927
  Downloading nvidia-nccl-cu12
5928
  Downloading nvidia-cublas-cu12
5929
  Downloading nvidia-cudnn-cu12
5930
  Downloading torch
5931
+ Installed 26 packages in 524ms
5932
  </div>
5933
  </div>
5934
  <div class="cell-artifacts">
 
5947
  <span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
5948
  <span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
5949
  </span> |
5950
+ Cell: gptoss_training_run | deps: torch, numpy | 40.42s
5951
  | <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
5952
  <button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
5953
  <a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
 
6248
 
6249
  Warming up (10 iterations)...
6250
  Benchmarking (50 iterations)...
6251
+ Progress: 20% complete (avg: 48.387 ms)
6252
+ Progress: 40% complete (avg: 48.249 ms)
6253
+ Progress: 60% complete (avg: 47.887 ms)
6254
+ Progress: 80% complete (avg: 47.011 ms)
6255
 
6256
  Output tensors:
6257
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
 
6261
  Iterations: 50
6262
 
6263
  Latency Statistics:
6264
+ Average: 46.098 ms
6265
+ Min: 38.839 ms
6266
+ Max: 49.404 ms
6267
+ Std Dev: 2.469 ms
6268
 
6269
  Percentiles:
6270
+ P50 (median): 47.240 ms
6271
+ P95: 48.725 ms
6272
+ P99: 49.168 ms
6273
 
6274
  Throughput:
6275
+ Tokens/sec: 2169.3
6276
+ Std Dev: 122.3
6277
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
6278
 
6279
  Saved benchmark results to gptoss_training_results.json
 
6283
  <div class="uv-install-logs" id="uv-logs-gptoss_training_run">
6284
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
6285
  <div class="uv-logs-content" style="display: none;">
6286
+ Downloading nvidia-cufile-cu12 (1.1MiB)
 
6287
  Downloading setuptools (1.1MiB)
6288
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
6289
+ Downloading sympy (6.0MiB)
6290
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
6291
+ Downloading nvidia-cublas-cu12 (566.8MiB)
6292
+ Downloading nvidia-nccl-cu12 (307.4MiB)
6293
+ Downloading torch (846.9MiB)
6294
  Downloading nvidia-cudnn-cu12 (674.0MiB)
6295
+ Downloading networkx (1.9MiB)
6296
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
6297
+ Downloading nvidia-curand-cu12 (60.7MiB)
6298
  Downloading nvidia-cusolver-cu12 (255.1MiB)
 
 
 
6299
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
6300
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
6301
+ Downloading numpy (16.2MiB)
6302
  Downloading nvidia-cufft-cu12 (184.2MiB)
 
 
6303
  Downloading triton (148.3MiB)
 
6304
  Downloading nvidia-cufile-cu12
6305
  Downloading setuptools
6306
  Downloading networkx
 
6319
  Downloading nvidia-cublas-cu12
6320
  Downloading nvidia-cudnn-cu12
6321
  Downloading torch
6322
+ Installed 26 packages in 451ms
6323
  </div>
6324
  </div>
6325
  <div class="cell-artifacts">
 
6338
  <span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
6339
  <span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
6340
  </span> |
6341
+ Cell: megablocks_run | deps: torch, numpy, kernels | 40.19s | FAILED
6342
  | <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
6343
  <button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
6344
  <a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
 
6493
  <span class="c1"># Attach loaded expert weights to the experts container</span>
6494
  <span class="n">e</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">experts</span>
6495
  <span class="n">e</span><span class="o">.</span><span class="n">alpha</span> <span class="o">=</span> <span class="mf">1.702</span>
6496
+ <span class="n">e</span><span class="o">.</span><span class="n">capacity_factor</span> <span class="o">=</span> <span class="mi">32</span>
6497
  <span class="n">e</span><span class="o">.</span><span class="n">gate_up_proj</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">gate_up_proj</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
6498
  <span class="n">e</span><span class="o">.</span><span class="n">gate_up_proj_bias</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">gate_up_proj_bias</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
6499
  <span class="n">e</span><span class="o">.</span><span class="n">down_proj</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">down_proj</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
 
6570
  <div class="uv-install-logs" id="uv-logs-megablocks_run">
6571
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
6572
  <div class="uv-logs-content" style="display: none;">
 
 
6573
  Downloading numpy (16.2MiB)
 
 
6574
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 
 
6575
  Downloading nvidia-cufile-cu12 (1.1MiB)
 
 
 
6576
  Downloading hf-xet (3.0MiB)
6577
+ Downloading networkx (1.9MiB)
6578
+ Downloading torch (846.9MiB)
6579
  Downloading nvidia-cusparse-cu12 (274.9MiB)
6580
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
6581
  Downloading nvidia-cudnn-cu12 (674.0MiB)
6582
+ Downloading triton (148.3MiB)
6583
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
6584
+ Downloading nvidia-curand-cu12 (60.7MiB)
6585
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
6586
  Downloading sympy (6.0MiB)
6587
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
6588
+ Downloading nvidia-cublas-cu12 (566.8MiB)
6589
+ Downloading nvidia-cufft-cu12 (184.2MiB)
6590
+ Downloading nvidia-nccl-cu12 (307.4MiB)
6591
+ Downloading setuptools (1.1MiB)
6592
  Downloading nvidia-cufile-cu12
6593
  Downloading hf-xet
6594
  Downloading setuptools
 
6602
  Downloading triton
6603
  Downloading nvidia-cufft-cu12
6604
  Downloading nvidia-cusolver-cu12
 
6605
  Downloading nvidia-cusparse-cu12
6606
+ Downloading nvidia-cusparselt-cu12
6607
  Downloading nvidia-nccl-cu12
6608
  Downloading nvidia-cublas-cu12
6609
  Downloading nvidia-cudnn-cu12
6610
  Downloading torch
6611
+ Installed 37 packages in 449ms
6612
  </div>
6613
  </div>
6614
  <div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00&lt;?, ?it/s]
6615
+ Fetching 66 files: 2%|▏ | 1/66 [00:00&lt;00:23, 2.74it/s]
6616
+ Fetching 66 files: 14%|█▎ | 9/66 [00:00&lt;00:03, 17.38it/s]
6617
+ Fetching 66 files: 26%|██▌ | 17/66 [00:01&lt;00:02, 17.85it/s]
6618
+ Fetching 66 files: 55%|█████▍ | 36/66 [00:01&lt;00:00, 42.23it/s]
6619
+ Fetching 66 files: 65%|██████▌ | 43/66 [00:01&lt;00:00, 38.03it/s]
6620
+ Fetching 66 files: 74%|███████▍ | 49/66 [00:01&lt;00:00, 30.77it/s]
6621
+ Fetching 66 files: 97%|█████████▋| 64/66 [00:01&lt;00:00, 48.18it/s]
6622
+ Fetching 66 files: 100%|██████████| 66/66 [00:01&lt;00:00, 34.40it/s]
6623
+ /tmp/tmptrubhjfl/cuda_utils.c:5:10: fatal error: Python.h: No such file or directory
 
6624
  5 | #include &lt;Python.h&gt;
6625
  | ^~~~~~~~~~
6626
  compilation terminated.
 
6637
  File &quot;/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/bench_utils.py&quot;, line 177, in &lt;lambda&gt;
6638
  call = lambda x: fn(x, *args[1:], **kwargs)
6639
  ^^^^^^^^^^^^^^^^^^^^^^^^^^
6640
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py&quot;, line 1773, in _wrapped_call_impl
6641
  return self._call_impl(*args, **kwargs)
6642
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6643
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py&quot;, line 1784, in _call_impl
6644
  return forward_call(*args, **kwargs)
6645
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6646
  File &quot;/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/megablocks_run.py&quot;, line 81, in forward
6647
  output, dummy_routing_weights = self.model(hidden_states)
6648
  ^^^^^^^^^^^^^^^^^^^^^^^^^
6649
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py&quot;, line 1773, in _wrapped_call_impl
6650
  return self._call_impl(*args, **kwargs)
6651
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6652
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py&quot;, line 1784, in _call_impl
6653
  return forward_call(*args, **kwargs)
6654
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6655
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py&quot;, line 896, in forward
6656
  output, expert_weights_out, *_ = moe_forward(
6657
  ^^^^^^^^^^^^
6658
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py&quot;, line 730, in moe_forward
6659
  x, tokens_per_expert = forward_fn(**forward_args)
6660
  ^^^^^^^^^^^^^^^^^^^^^^^^^^
6661
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py&quot;, line 457, in forward_once
6662
  x = permute_and_compute(
6663
  ^^^^^^^^^^^^^^^^^^^^
6664
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py&quot;, line 401, in permute_and_compute
6665
  x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
6666
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6667
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/autograd/function.py&quot;, line 576, in apply
6668
  return super().apply(*args, **kwargs) # type: ignore[misc]
6669
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6670
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/ops/stk_autocast.py&quot;, line 30, in decorate_fwd
6671
  return fwd(*args, **kwargs)
6672
  ^^^^^^^^^^^^^^^^^^^^
6673
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/ops/binned_gather.py&quot;, line 26, in forward
6674
  return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
6675
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6676
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/backend/kernels.py&quot;, line 419, in binned_gather
6677
  _binned_copy[(num_experts, expert_capacity)](
6678
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/jit.py&quot;, line 390, in &lt;lambda&gt;
6679
  return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
6680
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6681
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 239, in run
6682
  benchmark()
6683
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 228, in benchmark
6684
  timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
6685
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6686
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 228, in &lt;dictcomp&gt;
6687
  timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
6688
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6689
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 160, in _bench
6690
  return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))
6691
  ^^^^^^^^^^^^^
6692
  File &quot;/usr/lib/python3.11/functools.py&quot;, line 1001, in __get__
6693
  val = self.func(instance)
6694
  ^^^^^^^^^^^^^^^^^^^
6695
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 121, in do_bench
6696
  return driver.active.get_benchmarker()
6697
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6698
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py&quot;, line 30, in __getattr__
6699
  return getattr(self._initialize_obj(), name)
6700
  ^^^^^^^^^^^^^^^^^^^^^^
6701
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py&quot;, line 26, in _initialize_obj
6702
  self._obj = self._init_fn()
6703
  ^^^^^^^^^^^^^^^
6704
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py&quot;, line 12, in _create_driver
6705
  return active_drivers[0]()
6706
  ^^^^^^^^^^^^^^^^^^^
6707
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/driver.py&quot;, line 715, in __init__
6708
  self.utils = CudaUtils() # TODO: make static
6709
  ^^^^^^^^^^^
6710
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/driver.py&quot;, line 62, in __init__
6711
  mod = compile_module_from_src(
6712
  ^^^^^^^^^^^^^^^^^^^^^^^^
6713
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/build.py&quot;, line 88, in compile_module_from_src
6714
  so = _build(name, src_path, tmpdir, library_dirs or [], include_dirs or [], libraries or [])
6715
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6716
+ File &quot;/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/build.py&quot;, line 51, in _build
6717
  subprocess.check_call(cc_cmd, stdout=subprocess.DEVNULL)
6718
  File &quot;/usr/lib/python3.11/subprocess.py&quot;, line 413, in check_call
6719
  raise CalledProcessError(retcode, cmd)
6720
+ subprocess.CalledProcessError: Command &#x27;[&#x27;/usr/bin/gcc&#x27;, &#x27;/tmp/tmptrubhjfl/cuda_utils.c&#x27;, &#x27;-O3&#x27;, &#x27;-shared&#x27;, &#x27;-fPIC&#x27;, &#x27;-Wno-psabi&#x27;, &#x27;-o&#x27;, &#x27;/tmp/tmptrubhjfl/cuda_utils.cpython-311-x86_64-linux-gnu.so&#x27;, &#x27;-lcuda&#x27;, &#x27;-L/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/lib&#x27;, &#x27;-L/usr/lib/x86_64-linux-gnu&#x27;, &#x27;-I/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/include&#x27;, &#x27;-I/tmp/tmptrubhjfl&#x27;, &#x27;-I/usr/include/python3.11&#x27;]&#x27; returned non-zero exit status 1.</div>
6721
  </div>
6722
  </div>
6723