drbh HF Staff committed on
Commit
cbfd677
·
verified ·
1 Parent(s): 254b8f4

Upload folder using huggingface_hub

Browse files
flash_attn/artifacts/benchmark/Attention Benchmark.csv CHANGED
@@ -1,7 +1,7 @@
1
  seq_len,torch_cudnn,torch_cudnn_compile_d,torch_cudnn_compile_ma,torch_flash,torch_flash_compile_d,torch_flash_compile_ma,hf_flash_attn,hf_flash_attn3
2
- 4224.000000,3.802832,3.792784,4.181488,3.966576,3.956640,4.313024,3.396816,3.333200
3
- 4352.000000,4.081776,4.086624,4.433040,4.399632,4.392240,4.736416,3.837312,3.758016
4
- 4416.000000,4.146080,4.139200,4.479680,4.456032,4.446992,4.795904,3.893088,3.864576
5
- 4480.000000,4.211200,4.203072,4.555296,4.529248,4.523104,4.877248,3.951152,3.871312
6
- 4544.000000,4.436080,4.432784,4.789248,4.585120,4.580192,4.938464,4.010128,3.978448
7
- 4608.000000,4.504256,4.497184,4.872832,4.662272,4.654272,5.030304,4.065760,3.986496
 
1
  seq_len,torch_cudnn,torch_cudnn_compile_d,torch_cudnn_compile_ma,torch_flash,torch_flash_compile_d,torch_flash_compile_ma,hf_flash_attn,hf_flash_attn3
2
+ 4224.000000,3.807456,3.789232,4.191984,3.974816,3.953792,4.322096,3.403408,3.328416
3
+ 4352.000000,4.078480,4.072352,4.420736,4.400480,4.390000,4.738144,3.833424,3.755664
4
+ 4416.000000,4.139680,4.134800,4.490464,4.451040,4.443680,4.795104,3.890112,3.860992
5
+ 4480.000000,4.202048,4.195216,4.561248,4.524608,4.519520,4.877056,3.948816,3.866704
6
+ 4544.000000,4.434992,4.427040,4.788000,4.582336,4.571872,4.945728,4.015280,3.982320
7
+ 4608.000000,4.499456,4.490816,4.874464,4.669152,4.663648,5.035232,4.071872,3.983520
flash_attn/artifacts/benchmark/Attention Benchmark.png CHANGED

Git LFS Details

  • SHA256: 2746bf86889727851b82b499a36e0c654a874a2459835df179c5a02932db7554
  • Pointer size: 131 Bytes
  • Size of remote file: 145 kB

Git LFS Details

  • SHA256: 6c53c2d36126acbc493a355a1c4f5e298951b0aef26c51dfeaf9ad6e96ed00cd
  • Pointer size: 131 Bytes
  • Size of remote file: 145 kB
flash_attn/benchmark.html CHANGED
@@ -3715,19 +3715,86 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3715
  </div>
3716
 
3717
  <div class="main-content">
3718
- <div class="cell" id="cell-benchmark">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3719
  <div class="cell-header">
3720
  <span class="collapse-indicators">
3721
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3722
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3723
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3724
  </span> |
3725
- Cell: benchmark | 77.66s
3726
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3727
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3728
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
3729
  </div>
3730
- <div id="code-benchmark" class="cell-code" data-lines="343">
3731
  <div class="highlight-with-lines">
3732
  <div class="line-numbers" id="lines-benchmark">
3733
  <a class="line-number" data-cell="benchmark" data-line="1" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 1, true);">1</a>
@@ -4073,6 +4140,8 @@ Cell: benchmark | 77.66s
4073
  <a class="line-number" data-cell="benchmark" data-line="341" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 341, true);">341</a>
4074
  <a class="line-number" data-cell="benchmark" data-line="342" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 342, true);">342</a>
4075
  <a class="line-number" data-cell="benchmark" data-line="343" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 343, true);">343</a>
 
 
4076
  </div>
4077
  <div class="code-wrap">
4078
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
@@ -4417,7 +4486,9 @@ Cell: benchmark | 77.66s
4417
 
4418
  <span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">inference_mode</span><span class="p">():</span>
4419
  <span class="n">correctness</span><span class="p">()</span>
4420
- <span class="n">benchmark_fn</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="n">print_data</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">save_path</span><span class="o">=</span><span class="n">output_dir</span><span class="o">.</span><span class="n">as_posix</span><span class="p">())</span>
 
 
4421
  </pre></div>
4422
 
4423
  <div class="code-line-highlight" id="line-highlight-benchmark"></div>
@@ -4433,105 +4504,105 @@ xFormers not found.
4433
 
4434
 
4435
  ===== Testing shape: (1, 4224, 24, 128) =====
4436
- torch_cudnn : absmax=0.000871, mae=0.000075, mse=0.000000
4437
- torch_cudnn_compile_d : absmax=0.000871, mae=0.000075, mse=0.000000
4438
- torch_cudnn_compile_ma : absmax=0.000871, mae=0.000075, mse=0.000000
4439
- torch_flash : absmax=0.000947, mae=0.000075, mse=0.000000
4440
- torch_flash_compile_d : absmax=0.000947, mae=0.000075, mse=0.000000
4441
- torch_flash_compile_ma : absmax=0.000947, mae=0.000075, mse=0.000000
4442
- hf_flash_attn : absmax=0.000947, mae=0.000075, mse=0.000000
4443
- hf_flash_attn3 : absmax=0.000842, mae=0.000075, mse=0.000000
4444
 
4445
 
4446
  ===== Testing shape: (1, 4352, 24, 128) =====
4447
- torch_cudnn : absmax=0.001069, mae=0.000073, mse=0.000000
4448
- torch_cudnn_compile_d : absmax=0.001069, mae=0.000073, mse=0.000000
4449
- torch_cudnn_compile_ma : absmax=0.001069, mae=0.000073, mse=0.000000
4450
- torch_flash : absmax=0.000963, mae=0.000073, mse=0.000000
4451
- torch_flash_compile_d : absmax=0.000963, mae=0.000073, mse=0.000000
4452
- torch_flash_compile_ma : absmax=0.000963, mae=0.000073, mse=0.000000
4453
- hf_flash_attn : absmax=0.000963, mae=0.000073, mse=0.000000
4454
- hf_flash_attn3 : absmax=0.001069, mae=0.000073, mse=0.000000
4455
 
4456
 
4457
  ===== Testing shape: (1, 4416, 24, 128) =====
4458
- torch_cudnn : absmax=0.001802, mae=0.000073, mse=0.000000
4459
- torch_cudnn_compile_d : absmax=0.001802, mae=0.000073, mse=0.000000
4460
- torch_cudnn_compile_ma : absmax=0.001802, mae=0.000073, mse=0.000000
4461
- torch_flash : absmax=0.001802, mae=0.000073, mse=0.000000
4462
- torch_flash_compile_d : absmax=0.001802, mae=0.000073, mse=0.000000
4463
- torch_flash_compile_ma : absmax=0.001802, mae=0.000073, mse=0.000000
4464
- hf_flash_attn : absmax=0.001802, mae=0.000073, mse=0.000000
4465
- hf_flash_attn3 : absmax=0.001802, mae=0.000073, mse=0.000000
4466
 
4467
 
4468
  ===== Testing shape: (1, 4480, 24, 128) =====
4469
- torch_cudnn : absmax=0.001438, mae=0.000073, mse=0.000000
4470
- torch_cudnn_compile_d : absmax=0.001438, mae=0.000073, mse=0.000000
4471
- torch_cudnn_compile_ma : absmax=0.001438, mae=0.000073, mse=0.000000
4472
- torch_flash : absmax=0.001438, mae=0.000073, mse=0.000000
4473
- torch_flash_compile_d : absmax=0.001438, mae=0.000073, mse=0.000000
4474
- torch_flash_compile_ma : absmax=0.001438, mae=0.000073, mse=0.000000
4475
- hf_flash_attn : absmax=0.001438, mae=0.000073, mse=0.000000
4476
- hf_flash_attn3 : absmax=0.001438, mae=0.000073, mse=0.000000
4477
 
4478
 
4479
  ===== Testing shape: (1, 4544, 24, 128) =====
4480
  torch_cudnn : absmax=0.000976, mae=0.000072, mse=0.000000
4481
  torch_cudnn_compile_d : absmax=0.000976, mae=0.000072, mse=0.000000
4482
  torch_cudnn_compile_ma : absmax=0.000976, mae=0.000072, mse=0.000000
4483
- torch_flash : absmax=0.000976, mae=0.000072, mse=0.000000
4484
- torch_flash_compile_d : absmax=0.000976, mae=0.000072, mse=0.000000
4485
- torch_flash_compile_ma : absmax=0.000976, mae=0.000072, mse=0.000000
4486
- hf_flash_attn : absmax=0.000976, mae=0.000072, mse=0.000000
4487
- hf_flash_attn3 : absmax=0.000976, mae=0.000072, mse=0.000000
4488
 
4489
 
4490
  ===== Testing shape: (1, 4608, 24, 128) =====
4491
- torch_cudnn : absmax=0.000937, mae=0.000072, mse=0.000000
4492
- torch_cudnn_compile_d : absmax=0.000937, mae=0.000072, mse=0.000000
4493
- torch_cudnn_compile_ma : absmax=0.000937, mae=0.000072, mse=0.000000
4494
- torch_flash : absmax=0.000937, mae=0.000072, mse=0.000000
4495
- torch_flash_compile_d : absmax=0.000937, mae=0.000072, mse=0.000000
4496
- torch_flash_compile_ma : absmax=0.000937, mae=0.000072, mse=0.000000
4497
- hf_flash_attn : absmax=0.000937, mae=0.000072, mse=0.000000
4498
- hf_flash_attn3 : absmax=0.000937, mae=0.000072, mse=0.000000
4499
  Attention Benchmark:
4500
  seq_len torch_cudnn torch_cudnn_compile_d torch_cudnn_compile_ma torch_flash torch_flash_compile_d torch_flash_compile_ma hf_flash_attn hf_flash_attn3
4501
- 0 4224.0 3.802832 3.792784 4.181488 3.966576 3.956640 4.313024 3.396816 3.333200
4502
- 1 4352.0 4.081776 4.086624 4.433040 4.399632 4.392240 4.736416 3.837312 3.758016
4503
- 2 4416.0 4.146080 4.139200 4.479680 4.456032 4.446992 4.795904 3.893088 3.864576
4504
- 3 4480.0 4.211200 4.203072 4.555296 4.529248 4.523104 4.877248 3.951152 3.871312
4505
- 4 4544.0 4.436080 4.432784 4.789248 4.585120 4.580192 4.938464 4.010128 3.978448
4506
- 5 4608.0 4.504256 4.497184 4.872832 4.662272 4.654272 5.030304 4.065760 3.986496
4507
  </div>
4508
  <div class="uv-install-logs" id="uv-logs-benchmark">
4509
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4510
  <div class="uv-logs-content" style="display: none;">
4511
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4512
- Downloading pandas (11.8MiB)
4513
- Downloading sympy (6.0MiB)
4514
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
 
 
 
4515
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
4516
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4517
- Downloading fonttools (4.7MiB)
4518
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4519
- Downloading triton (148.3MiB)
4520
- Downloading setuptools (1.1MiB)
4521
- Downloading pillow (6.3MiB)
4522
  Downloading nvidia-cufft-cu12 (184.2MiB)
4523
- Downloading kiwisolver (1.4MiB)
4524
  Downloading matplotlib (8.3MiB)
 
 
 
4525
  Downloading nvidia-curand-cu12 (60.7MiB)
4526
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4527
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4528
- Downloading torch (846.9MiB)
4529
  Downloading nvidia-cublas-cu12 (566.8MiB)
 
4530
  Downloading numpy (16.2MiB)
4531
- Downloading nvidia-nccl-cu12 (307.4MiB)
4532
- Downloading nvidia-cufile-cu12 (1.1MiB)
4533
- Downloading networkx (1.9MiB)
4534
- Downloading hf-xet (3.0MiB)
 
4535
  Downloading nvidia-cufile-cu12
4536
  Downloading kiwisolver
4537
  Downloading hf-xet
@@ -4544,30 +4615,35 @@ Downloading hf-xet (3.0MiB)
4544
  Downloading sympy
4545
  Downloading numpy
4546
  Downloading nvidia-nvjitlink-cu12
4547
- Downloading pandas
4548
  Downloading nvidia-curand-cu12
 
4549
  Downloading nvidia-cuda-nvrtc-cu12
4550
  Downloading triton
4551
  Downloading nvidia-cufft-cu12
4552
  Downloading nvidia-cusolver-cu12
4553
- Downloading nvidia-cusparse-cu12
4554
  Downloading nvidia-cusparselt-cu12
 
4555
  Downloading nvidia-nccl-cu12
4556
  Downloading nvidia-cublas-cu12
4557
  Downloading nvidia-cudnn-cu12
4558
  Downloading torch
4559
- Installed 49 packages in 617ms
4560
  </div>
4561
  </div>
4562
  <div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
4563
- Fetching 20 files: 5%|▌ | 1/20 [00:00&lt;00:04, 4.35it/s]
4564
- Fetching 20 files: 10%|█ | 2/20 [00:02&lt;00:23, 1.31s/it]
4565
- Fetching 20 files: 100%|██████████| 20/20 [00:02&lt;00:00, 8.73it/s]
4566
 
4567
  Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4568
- Fetching 4 files: 25%|██▌ | 1/4 [00:00&lt;00:00, 5.70it/s]
4569
- Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.31it/s]
4570
- Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.96it/s]</div>
 
 
 
 
 
4571
  <div class="cell-artifacts">
4572
  <h4>Artifacts:</h4>
4573
  <a href="artifacts/benchmark/Attention Benchmark.png" class="artifact" target="_blank">Attention Benchmark.png</a>
 
3715
  </div>
3716
 
3717
  <div class="main-content">
3718
+ <div class="cell" id="cell-nv">
3719
+ <div class="cell-header">
3720
+ <span class="collapse-indicators">
3721
+ <span onclick="toggleCode('nv')" style="cursor: pointer;">▼ code</span>
3722
+ <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3723
+ <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3724
+ </span> |
3725
+ Cell: nv | 0.70s
3726
+ | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3727
+ <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3728
+ <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
3729
+ </div>
3730
+ <div id="code-nv" class="cell-code" data-lines="3">
3731
+ <div class="highlight-with-lines">
3732
+ <div class="line-numbers" id="lines-nv">
3733
+ <a class="line-number" data-cell="nv" data-line="1" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 1, true);">1</a>
3734
+ <a class="line-number" data-cell="nv" data-line="2" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 2, true);">2</a>
3735
+ <a class="line-number" data-cell="nv" data-line="3" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 3, true);">3</a>
3736
+ </div>
3737
+ <div class="code-wrap">
3738
+ <div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">subprocess</span>
3739
+
3740
+ <span class="nb">print</span><span class="p">(</span><span class="n">subprocess</span><span class="o">.</span><span class="n">run</span><span class="p">([</span><span class="s2">&quot;nvidia-smi&quot;</span><span class="p">],</span> <span class="n">capture_output</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">text</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">stdout</span><span class="p">)</span>
3741
+ </pre></div>
3742
+
3743
+ <div class="code-line-highlight" id="line-highlight-nv"></div>
3744
+ </div>
3745
+ </div>
3746
+ </div>
3747
+ <div id="output-nv" class="cell-output">
3748
+ <div class="cell-stdout">Fri Sep 26 02:23:10 2025
3749
+ +-----------------------------------------------------------------------------------------+
3750
+ | NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
3751
+ |-----------------------------------------+------------------------+----------------------+
3752
+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3753
+ | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3754
+ | | | MIG M. |
3755
+ |=========================================+========================+======================|
3756
+ | 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
3757
+ | 0% 31C P8 26W / 300W | 0MiB / 23028MiB | 0% Default |
3758
+ | | | N/A |
3759
+ +-----------------------------------------+------------------------+----------------------+
3760
+ | 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
3761
+ | 0% 30C P8 26W / 300W | 0MiB / 23028MiB | 0% Default |
3762
+ | | | N/A |
3763
+ +-----------------------------------------+------------------------+----------------------+
3764
+ | 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
3765
+ | 0% 31C P8 27W / 300W | 0MiB / 23028MiB | 0% Default |
3766
+ | | | N/A |
3767
+ +-----------------------------------------+------------------------+----------------------+
3768
+ | 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
3769
+ | 0% 30C P8 27W / 300W | 0MiB / 23028MiB | 0% Default |
3770
+ | | | N/A |
3771
+ +-----------------------------------------+------------------------+----------------------+
3772
+
3773
+ +-----------------------------------------------------------------------------------------+
3774
+ | Processes: |
3775
+ | GPU GI CI PID Type Process name GPU Memory |
3776
+ | ID ID Usage |
3777
+ |=========================================================================================|
3778
+ | No running processes found |
3779
+ +-----------------------------------------------------------------------------------------+
3780
+
3781
+ </div>
3782
+ </div>
3783
+ </div>
3784
+
3785
+ <div class="cell cell-failed" id="cell-benchmark">
3786
  <div class="cell-header">
3787
  <span class="collapse-indicators">
3788
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3789
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3790
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3791
  </span> |
3792
+ Cell: benchmark | 77.48s | FAILED
3793
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3794
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3795
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
3796
  </div>
3797
+ <div id="code-benchmark" class="cell-code" data-lines="345">
3798
  <div class="highlight-with-lines">
3799
  <div class="line-numbers" id="lines-benchmark">
3800
  <a class="line-number" data-cell="benchmark" data-line="1" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 1, true);">1</a>
 
4140
  <a class="line-number" data-cell="benchmark" data-line="341" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 341, true);">341</a>
4141
  <a class="line-number" data-cell="benchmark" data-line="342" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 342, true);">342</a>
4142
  <a class="line-number" data-cell="benchmark" data-line="343" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 343, true);">343</a>
4143
+ <a class="line-number" data-cell="benchmark" data-line="344" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 344, true);">344</a>
4144
+ <a class="line-number" data-cell="benchmark" data-line="345" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 345, true);">345</a>
4145
  </div>
4146
  <div class="code-wrap">
4147
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
 
4486
 
4487
  <span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">inference_mode</span><span class="p">():</span>
4488
  <span class="n">correctness</span><span class="p">()</span>
4489
+ <span class="n">fig</span> <span class="o">=</span> <span class="n">benchmark_fn</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="n">print_data</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">save_path</span><span class="o">=</span><span class="n">output_dir</span><span class="o">.</span><span class="n">as_posix</span><span class="p">())</span>
4490
+ <span class="n">fig</span><span class="o">.</span><span class="n">savefig</span><span class="p">(</span><span class="n">output_dir</span> <span class="o">/</span> <span class="s2">&quot;attention_benchmark.png&quot;</span><span class="p">,</span> <span class="n">dpi</span><span class="o">=</span><span class="mi">300</span><span class="p">,</span> <span class="n">bbox_inches</span><span class="o">=</span><span class="s2">&quot;tight&quot;</span><span class="p">)</span>
4491
+ <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Benchmark plot saved to: </span><span class="si">{</span><span class="n">output_dir</span><span class="w"> </span><span class="o">/</span><span class="w"> </span><span class="s1">&#39;attention_benchmark.png&#39;</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
4492
  </pre></div>
4493
 
4494
  <div class="code-line-highlight" id="line-highlight-benchmark"></div>
 
4504
 
4505
 
4506
  ===== Testing shape: (1, 4224, 24, 128) =====
4507
+ torch_cudnn : absmax=0.000994, mae=0.000075, mse=0.000000
4508
+ torch_cudnn_compile_d : absmax=0.000994, mae=0.000075, mse=0.000000
4509
+ torch_cudnn_compile_ma : absmax=0.000994, mae=0.000075, mse=0.000000
4510
+ torch_flash : absmax=0.000994, mae=0.000075, mse=0.000000
4511
+ torch_flash_compile_d : absmax=0.000994, mae=0.000075, mse=0.000000
4512
+ torch_flash_compile_ma : absmax=0.000994, mae=0.000075, mse=0.000000
4513
+ hf_flash_attn : absmax=0.000994, mae=0.000075, mse=0.000000
4514
+ hf_flash_attn3 : absmax=0.000994, mae=0.000075, mse=0.000000
4515
 
4516
 
4517
  ===== Testing shape: (1, 4352, 24, 128) =====
4518
+ torch_cudnn : absmax=0.001718, mae=0.000073, mse=0.000000
4519
+ torch_cudnn_compile_d : absmax=0.001718, mae=0.000073, mse=0.000000
4520
+ torch_cudnn_compile_ma : absmax=0.001718, mae=0.000073, mse=0.000000
4521
+ torch_flash : absmax=0.001718, mae=0.000073, mse=0.000000
4522
+ torch_flash_compile_d : absmax=0.001718, mae=0.000073, mse=0.000000
4523
+ torch_flash_compile_ma : absmax=0.001718, mae=0.000073, mse=0.000000
4524
+ hf_flash_attn : absmax=0.001718, mae=0.000073, mse=0.000000
4525
+ hf_flash_attn3 : absmax=0.001718, mae=0.000073, mse=0.000000
4526
 
4527
 
4528
  ===== Testing shape: (1, 4416, 24, 128) =====
4529
+ torch_cudnn : absmax=0.001273, mae=0.000073, mse=0.000000
4530
+ torch_cudnn_compile_d : absmax=0.001273, mae=0.000073, mse=0.000000
4531
+ torch_cudnn_compile_ma : absmax=0.001273, mae=0.000073, mse=0.000000
4532
+ torch_flash : absmax=0.001440, mae=0.000073, mse=0.000000
4533
+ torch_flash_compile_d : absmax=0.001440, mae=0.000073, mse=0.000000
4534
+ torch_flash_compile_ma : absmax=0.001440, mae=0.000073, mse=0.000000
4535
+ hf_flash_attn : absmax=0.001440, mae=0.000073, mse=0.000000
4536
+ hf_flash_attn3 : absmax=0.001440, mae=0.000073, mse=0.000000
4537
 
4538
 
4539
  ===== Testing shape: (1, 4480, 24, 128) =====
4540
+ torch_cudnn : absmax=0.001284, mae=0.000073, mse=0.000000
4541
+ torch_cudnn_compile_d : absmax=0.001284, mae=0.000073, mse=0.000000
4542
+ torch_cudnn_compile_ma : absmax=0.001284, mae=0.000073, mse=0.000000
4543
+ torch_flash : absmax=0.001284, mae=0.000073, mse=0.000000
4544
+ torch_flash_compile_d : absmax=0.001284, mae=0.000073, mse=0.000000
4545
+ torch_flash_compile_ma : absmax=0.001284, mae=0.000073, mse=0.000000
4546
+ hf_flash_attn : absmax=0.001284, mae=0.000073, mse=0.000000
4547
+ hf_flash_attn3 : absmax=0.001284, mae=0.000073, mse=0.000000
4548
 
4549
 
4550
  ===== Testing shape: (1, 4544, 24, 128) =====
4551
  torch_cudnn : absmax=0.000976, mae=0.000072, mse=0.000000
4552
  torch_cudnn_compile_d : absmax=0.000976, mae=0.000072, mse=0.000000
4553
  torch_cudnn_compile_ma : absmax=0.000976, mae=0.000072, mse=0.000000
4554
+ torch_flash : absmax=0.000834, mae=0.000072, mse=0.000000
4555
+ torch_flash_compile_d : absmax=0.000834, mae=0.000072, mse=0.000000
4556
+ torch_flash_compile_ma : absmax=0.000834, mae=0.000072, mse=0.000000
4557
+ hf_flash_attn : absmax=0.000834, mae=0.000072, mse=0.000000
4558
+ hf_flash_attn3 : absmax=0.000815, mae=0.000072, mse=0.000000
4559
 
4560
 
4561
  ===== Testing shape: (1, 4608, 24, 128) =====
4562
+ torch_cudnn : absmax=0.000926, mae=0.000072, mse=0.000000
4563
+ torch_cudnn_compile_d : absmax=0.000926, mae=0.000072, mse=0.000000
4564
+ torch_cudnn_compile_ma : absmax=0.000926, mae=0.000072, mse=0.000000
4565
+ torch_flash : absmax=0.000926, mae=0.000072, mse=0.000000
4566
+ torch_flash_compile_d : absmax=0.000926, mae=0.000072, mse=0.000000
4567
+ torch_flash_compile_ma : absmax=0.000926, mae=0.000072, mse=0.000000
4568
+ hf_flash_attn : absmax=0.000926, mae=0.000072, mse=0.000000
4569
+ hf_flash_attn3 : absmax=0.000926, mae=0.000072, mse=0.000000
4570
  Attention Benchmark:
4571
  seq_len torch_cudnn torch_cudnn_compile_d torch_cudnn_compile_ma torch_flash torch_flash_compile_d torch_flash_compile_ma hf_flash_attn hf_flash_attn3
4572
+ 0 4224.0 3.807456 3.789232 4.191984 3.974816 3.953792 4.322096 3.403408 3.328416
4573
+ 1 4352.0 4.078480 4.072352 4.420736 4.400480 4.390000 4.738144 3.833424 3.755664
4574
+ 2 4416.0 4.139680 4.134800 4.490464 4.451040 4.443680 4.795104 3.890112 3.860992
4575
+ 3 4480.0 4.202048 4.195216 4.561248 4.524608 4.519520 4.877056 3.948816 3.866704
4576
+ 4 4544.0 4.434992 4.427040 4.788000 4.582336 4.571872 4.945728 4.015280 3.982320
4577
+ 5 4608.0 4.499456 4.490816 4.874464 4.669152 4.663648 5.035232 4.071872 3.983520
4578
  </div>
4579
  <div class="uv-install-logs" id="uv-logs-benchmark">
4580
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4581
  <div class="uv-logs-content" style="display: none;">
4582
+ Downloading hf-xet (3.0MiB)
4583
+ Downloading setuptools (1.1MiB)
4584
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4585
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4586
+ Downloading pandas (11.8MiB)
4587
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4588
+ Downloading networkx (1.9MiB)
4589
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4590
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
 
 
 
 
 
4591
  Downloading nvidia-cufft-cu12 (184.2MiB)
4592
+ Downloading triton (148.3MiB)
4593
  Downloading matplotlib (8.3MiB)
4594
+ Downloading kiwisolver (1.4MiB)
4595
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4596
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4597
  Downloading nvidia-curand-cu12 (60.7MiB)
 
 
 
4598
  Downloading nvidia-cublas-cu12 (566.8MiB)
4599
+ Downloading torch (846.9MiB)
4600
  Downloading numpy (16.2MiB)
4601
+ Downloading pillow (6.3MiB)
4602
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4603
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4604
+ Downloading fonttools (4.7MiB)
4605
+ Downloading sympy (6.0MiB)
4606
  Downloading nvidia-cufile-cu12
4607
  Downloading kiwisolver
4608
  Downloading hf-xet
 
4615
  Downloading sympy
4616
  Downloading numpy
4617
  Downloading nvidia-nvjitlink-cu12
 
4618
  Downloading nvidia-curand-cu12
4619
+ Downloading pandas
4620
  Downloading nvidia-cuda-nvrtc-cu12
4621
  Downloading triton
4622
  Downloading nvidia-cufft-cu12
4623
  Downloading nvidia-cusolver-cu12
 
4624
  Downloading nvidia-cusparselt-cu12
4625
+ Downloading nvidia-cusparse-cu12
4626
  Downloading nvidia-nccl-cu12
4627
  Downloading nvidia-cublas-cu12
4628
  Downloading nvidia-cudnn-cu12
4629
  Downloading torch
4630
+ Installed 49 packages in 520ms
4631
  </div>
4632
  </div>
4633
  <div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
4634
+ Fetching 20 files: 5%|▌ | 1/20 [00:00&lt;00:03, 4.89it/s]
4635
+ Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:17, 1.02it/s]
4636
+ Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 11.55it/s]
4637
 
4638
  Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4639
+ Fetching 4 files: 25%|██▌ | 1/4 [00:00&lt;00:00, 6.41it/s]
4640
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.10it/s]
4641
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.52it/s]
4642
+ Traceback (most recent call last):
4643
+ File &quot;/repo/flash_attn/.uvnote/cells/benchmark.py&quot;, line 344, in &lt;module&gt;
4644
+ fig.savefig(output_dir / &quot;attention_benchmark.png&quot;, dpi=300, bbox_inches=&quot;tight&quot;)
4645
+ ^^^^^^^^^^^
4646
+ AttributeError: &#x27;NoneType&#x27; object has no attribute &#x27;savefig&#x27;</div>
4647
  <div class="cell-artifacts">
4648
  <h4>Artifacts:</h4>
4649
  <a href="artifacts/benchmark/Attention Benchmark.png" class="artifact" target="_blank">Attention Benchmark.png</a>
flash_attn/cells/benchmark.py CHANGED
@@ -340,4 +340,6 @@ def benchmark_fn(seq_len: int, provider: str):
340
 
341
  with torch.inference_mode():
342
  correctness()
343
- benchmark_fn.run(print_data=True, save_path=output_dir.as_posix())
 
 
 
340
 
341
  with torch.inference_mode():
342
  correctness()
343
+ fig = benchmark_fn.run(print_data=True, save_path=output_dir.as_posix())
344
+ fig.savefig(output_dir / "attention_benchmark.png", dpi=300, bbox_inches="tight")
345
+ print(f"Benchmark plot saved to: {output_dir / 'attention_benchmark.png'}")
flash_attn/cells/nv.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ import subprocess
2
+
3
+ print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
moe_benchmarks/megablocks/megablocks_only.html CHANGED
@@ -3715,74 +3715,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3715
  </div>
3716
 
3717
  <div class="main-content">
3718
- <div class="cell" id="cell-nv">
3719
- <div class="cell-header">
3720
- <span class="collapse-indicators">
3721
- <span onclick="toggleCode('nv')" style="cursor: pointer;">▼ code</span>
3722
- <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3723
- <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3724
- </span> |
3725
- Cell: nv | 0.67s
3726
- | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3727
- <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3728
- <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
3729
- </div>
3730
- <div id="code-nv" class="cell-code" data-lines="3">
3731
- <div class="highlight-with-lines">
3732
- <div class="line-numbers" id="lines-nv">
3733
- <a class="line-number" data-cell="nv" data-line="1" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 1, true);">1</a>
3734
- <a class="line-number" data-cell="nv" data-line="2" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 2, true);">2</a>
3735
- <a class="line-number" data-cell="nv" data-line="3" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 3, true);">3</a>
3736
- </div>
3737
- <div class="code-wrap">
3738
- <div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">subprocess</span>
3739
-
3740
- <span class="nb">print</span><span class="p">(</span><span class="n">subprocess</span><span class="o">.</span><span class="n">run</span><span class="p">([</span><span class="s2">&quot;nvidia-smi&quot;</span><span class="p">],</span> <span class="n">capture_output</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">text</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">stdout</span><span class="p">)</span>
3741
- </pre></div>
3742
-
3743
- <div class="code-line-highlight" id="line-highlight-nv"></div>
3744
- </div>
3745
- </div>
3746
- </div>
3747
- <div id="output-nv" class="cell-output">
3748
- <div class="cell-stdout">Thu Sep 25 20:02:38 2025
3749
- +-----------------------------------------------------------------------------------------+
3750
- | NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
3751
- |-----------------------------------------+------------------------+----------------------+
3752
- | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3753
- | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3754
- | | | MIG M. |
3755
- |=========================================+========================+======================|
3756
- | 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
3757
- | 0% 40C P0 49W / 300W | 0MiB / 23028MiB | 0% Default |
3758
- | | | N/A |
3759
- +-----------------------------------------+------------------------+----------------------+
3760
- | 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
3761
- | 0% 33C P8 23W / 300W | 0MiB / 23028MiB | 0% Default |
3762
- | | | N/A |
3763
- +-----------------------------------------+------------------------+----------------------+
3764
- | 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
3765
- | 0% 33C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
3766
- | | | N/A |
3767
- +-----------------------------------------+------------------------+----------------------+
3768
- | 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
3769
- | 0% 34C P8 26W / 300W | 0MiB / 23028MiB | 0% Default |
3770
- | | | N/A |
3771
- +-----------------------------------------+------------------------+----------------------+
3772
-
3773
- +-----------------------------------------------------------------------------------------+
3774
- | Processes: |
3775
- | GPU GI CI PID Type Process name GPU Memory |
3776
- | ID ID Usage |
3777
- |=========================================================================================|
3778
- | No running processes found |
3779
- +-----------------------------------------------------------------------------------------+
3780
-
3781
- </div>
3782
- </div>
3783
- </div>
3784
-
3785
- <h1>No Kernels</h1>
3786
  <p>First, we run the model without any custom kernels to get a reference point.</p>
3787
  <h2>Forward</h2>
3788
  <h2>Forward and Backward</h2>
@@ -3794,7 +3727,7 @@ Cell: nv | 0.67s
3794
  <span onclick="toggleOutput('forward_and_backward_no_kernel')" style="cursor: pointer;">▼ output</span>
3795
  <span id="uv-indicator-forward_and_backward_no_kernel" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3796
  </span> |
3797
- Cell: forward_and_backward_no_kernel | 16.89s | FAILED
3798
  | <button class="run-btn" onclick="runCell('forward_and_backward_no_kernel')">▶ run</button>
3799
  <button class="copy-btn" onclick="copyCell('forward_and_backward_no_kernel')">Copy</button>
3800
  <a href="cells/forward_and_backward_no_kernel.py" target="_blank" class="raw-btn">Raw</a>
 
3715
  </div>
3716
 
3717
  <div class="main-content">
3718
+ <h1>No Kernels</h1>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3719
  <p>First, we run the model without any custom kernels to get a reference point.</p>
3720
  <h2>Forward</h2>
3721
  <h2>Forward and Backward</h2>
 
3727
  <span onclick="toggleOutput('forward_and_backward_no_kernel')" style="cursor: pointer;">▼ output</span>
3728
  <span id="uv-indicator-forward_and_backward_no_kernel" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3729
  </span> |
3730
+ Cell: forward_and_backward_no_kernel | 17.31s | FAILED
3731
  | <button class="run-btn" onclick="runCell('forward_and_backward_no_kernel')">▶ run</button>
3732
  <button class="copy-btn" onclick="copyCell('forward_and_backward_no_kernel')">Copy</button>
3733
  <a href="cells/forward_and_backward_no_kernel.py" target="_blank" class="raw-btn">Raw</a>
moe_benchmarks/megablocks_yamoe/artifacts/binned_run/binned_results.json CHANGED
@@ -9,16 +9,16 @@
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
- "avg_ms": 36.1408544400183,
13
- "min_ms": 33.21830599998066,
14
- "max_ms": 38.347281000142175,
15
- "std_ms": 1.386811930577117,
16
- "p50_ms": 36.57941149992894,
17
- "p95_ms": 37.79091359995164,
18
- "p99_ms": 38.30271452006173,
19
  "num_iters": 50,
20
- "tokens_per_s": 2766.951737844673,
21
- "throughput_variance": 108.07582031577446
22
  },
23
  "output_sum": 3.97190523147583
24
  }
 
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
+ "avg_ms": 37.042515599997614,
13
+ "min_ms": 33.61098199997059,
14
+ "max_ms": 39.77627800003347,
15
+ "std_ms": 1.6558189449135647,
16
+ "p50_ms": 37.082583499994826,
17
+ "p95_ms": 39.325366850013665,
18
+ "p99_ms": 39.73607153999694,
19
  "num_iters": 50,
20
+ "tokens_per_s": 2699.600671829276,
21
+ "throughput_variance": 122.63980223025922
22
  },
23
  "output_sum": 3.97190523147583
24
  }
moe_benchmarks/megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json CHANGED
@@ -9,16 +9,16 @@
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
- "avg_ms": 46.66910740000276,
13
- "min_ms": 40.354887999910716,
14
- "max_ms": 50.56437000007463,
15
- "std_ms": 2.944349756547624,
16
- "p50_ms": 47.07003099997564,
17
- "p95_ms": 50.338209400013056,
18
- "p99_ms": 50.5430893000721,
19
  "num_iters": 50,
20
- "tokens_per_s": 2142.7450742285696,
21
- "throughput_variance": 139.2849368532802
22
  },
23
  "output_sum": 11.53223705291748
24
  }
 
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
+ "avg_ms": 47.38012357999878,
13
+ "min_ms": 40.92212500000869,
14
+ "max_ms": 51.281423999967046,
15
+ "std_ms": 2.9172375790717613,
16
+ "p50_ms": 48.13728099998116,
17
+ "p95_ms": 51.063823949996845,
18
+ "p99_ms": 51.260956209974324,
19
  "num_iters": 50,
20
+ "tokens_per_s": 2110.589682847817,
21
+ "throughput_variance": 134.12269492084684
22
  },
23
  "output_sum": 11.53223705291748
24
  }
moe_benchmarks/megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json CHANGED
@@ -9,16 +9,16 @@
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
- "avg_ms": 47.25082628000564,
13
- "min_ms": 40.695745999983046,
14
- "max_ms": 51.11116500006574,
15
- "std_ms": 2.9791735891229654,
16
- "p50_ms": 47.64148850006222,
17
- "p95_ms": 50.98971859999892,
18
- "p99_ms": 51.07645830010824,
19
  "num_iters": 50,
20
- "tokens_per_s": 2116.365106663021,
21
- "throughput_variance": 137.95784254249725
22
  },
23
  "output_sum": 11.53223705291748
24
  }
 
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
+ "avg_ms": 46.92225349999944,
13
+ "min_ms": 38.89427600000772,
14
+ "max_ms": 51.62209400003803,
15
+ "std_ms": 3.930283839179673,
16
+ "p50_ms": 48.18643950000023,
17
+ "p95_ms": 51.4210894500053,
18
+ "p99_ms": 51.56389033003563,
19
  "num_iters": 50,
20
+ "tokens_per_s": 2131.1849397855794,
21
+ "throughput_variance": 188.79708542409617
22
  },
23
  "output_sum": 11.53223705291748
24
  }
moe_benchmarks/megablocks_yamoe/artifacts/megablocks_run/megablocks_results.json CHANGED
@@ -9,16 +9,16 @@
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
- "avg_ms": 3.824586980035747,
13
- "min_ms": 0.8051279999108374,
14
- "max_ms": 8.439711999926658,
15
- "std_ms": 3.6657124717057186,
16
- "p50_ms": 0.8526945000539854,
17
- "p95_ms": 8.437759499952335,
18
- "p99_ms": 8.439305299993975,
19
  "num_iters": 50,
20
- "tokens_per_s": 26146.614136898344,
21
- "throughput_variance": 52691.24007431396
22
  },
23
  "output_sum": 6.4738850593566895
24
  }
 
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
+ "avg_ms": 3.8653191399976095,
13
+ "min_ms": 0.8422269999073251,
14
+ "max_ms": 8.544625000013184,
15
+ "std_ms": 3.690530253469649,
16
+ "p50_ms": 0.88288749992671,
17
+ "p95_ms": 8.536876499982782,
18
+ "p99_ms": 8.542453809967583,
19
  "num_iters": 50,
20
+ "tokens_per_s": 25871.084994048342,
21
+ "throughput_variance": 50674.824252369
22
  },
23
  "output_sum": 6.4738850593566895
24
  }
moe_benchmarks/megablocks_yamoe/artifacts/visualization/moe_performance_comparison.png CHANGED

Git LFS Details

  • SHA256: 2bdd995bfa88d4d6ce39358bb6c647cb884e453b683b3faf072c001199831567
  • Pointer size: 131 Bytes
  • Size of remote file: 308 kB

Git LFS Details

  • SHA256: 1c2252616019a28c0117d1bd8bb023d074018bfde3ba059b7a83dc3b192c9ace
  • Pointer size: 131 Bytes
  • Size of remote file: 308 kB
moe_benchmarks/megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json CHANGED
@@ -9,16 +9,16 @@
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
- "avg_ms": 4.249273639984494,
13
- "min_ms": 4.135004000090703,
14
- "max_ms": 4.295798000157447,
15
- "std_ms": 0.022830750834695483,
16
- "p50_ms": 4.2523765000623825,
17
- "p95_ms": 4.274072999987766,
18
- "p99_ms": 4.289211910063386,
19
  "num_iters": 50,
20
- "tokens_per_s": 23533.433822436742,
21
- "throughput_variance": 128.24246969319347
22
  },
23
- "output_sum": 3.97190523147583
24
  }
 
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
+ "avg_ms": 4.248850420003691,
13
+ "min_ms": 4.146223000020655,
14
+ "max_ms": 4.269965999981196,
15
+ "std_ms": 0.01914249322297606,
16
+ "p50_ms": 4.252545499980442,
17
+ "p95_ms": 4.265578499993694,
18
+ "p99_ms": 4.269833699987657,
19
  "num_iters": 50,
20
+ "tokens_per_s": 23535.77794341678,
21
+ "throughput_variance": 107.68667127056374
22
  },
23
+ "output_sum": 3.9719059467315674
24
  }
moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc CHANGED
Binary files a/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc and b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc differ
 
moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc CHANGED
Binary files a/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc and b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc differ
 
moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html CHANGED
@@ -3715,74 +3715,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3715
  </div>
3716
 
3717
  <div class="main-content">
3718
- <div class="cell" id="cell-nv">
3719
- <div class="cell-header">
3720
- <span class="collapse-indicators">
3721
- <span onclick="toggleCode('nv')" style="cursor: pointer;">▼ code</span>
3722
- <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3723
- <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3724
- </span> |
3725
- Cell: nv | 0.71s
3726
- | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3727
- <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3728
- <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
3729
- </div>
3730
- <div id="code-nv" class="cell-code" data-lines="3">
3731
- <div class="highlight-with-lines">
3732
- <div class="line-numbers" id="lines-nv">
3733
- <a class="line-number" data-cell="nv" data-line="1" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 1, true);">1</a>
3734
- <a class="line-number" data-cell="nv" data-line="2" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 2, true);">2</a>
3735
- <a class="line-number" data-cell="nv" data-line="3" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 3, true);">3</a>
3736
- </div>
3737
- <div class="code-wrap">
3738
- <div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">subprocess</span>
3739
-
3740
- <span class="nb">print</span><span class="p">(</span><span class="n">subprocess</span><span class="o">.</span><span class="n">run</span><span class="p">([</span><span class="s2">&quot;nvidia-smi&quot;</span><span class="p">],</span> <span class="n">capture_output</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">text</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">stdout</span><span class="p">)</span>
3741
- </pre></div>
3742
-
3743
- <div class="code-line-highlight" id="line-highlight-nv"></div>
3744
- </div>
3745
- </div>
3746
- </div>
3747
- <div id="output-nv" class="cell-output">
3748
- <div class="cell-stdout">Thu Sep 25 20:02:55 2025
3749
- +-----------------------------------------------------------------------------------------+
3750
- | NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
3751
- |-----------------------------------------+------------------------+----------------------+
3752
- | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3753
- | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3754
- | | | MIG M. |
3755
- |=========================================+========================+======================|
3756
- | 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
3757
- | 0% 36C P8 25W / 300W | 0MiB / 23028MiB | 0% Default |
3758
- | | | N/A |
3759
- +-----------------------------------------+------------------------+----------------------+
3760
- | 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
3761
- | 0% 33C P8 23W / 300W | 0MiB / 23028MiB | 0% Default |
3762
- | | | N/A |
3763
- +-----------------------------------------+------------------------+----------------------+
3764
- | 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
3765
- | 0% 33C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
3766
- | | | N/A |
3767
- +-----------------------------------------+------------------------+----------------------+
3768
- | 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
3769
- | 0% 33C P8 26W / 300W | 0MiB / 23028MiB | 0% Default |
3770
- | | | N/A |
3771
- +-----------------------------------------+------------------------+----------------------+
3772
-
3773
- +-----------------------------------------------------------------------------------------+
3774
- | Processes: |
3775
- | GPU GI CI PID Type Process name GPU Memory |
3776
- | ID ID Usage |
3777
- |=========================================================================================|
3778
- | No running processes found |
3779
- +-----------------------------------------------------------------------------------------+
3780
-
3781
- </div>
3782
- </div>
3783
- </div>
3784
-
3785
- <h1>Comparison of Megablocks and Yamoe Kernels</h1>
3786
  <p>This note compares the performance of the Megablocks and Yamoe kernels on the GPT-OSS-20B model.</p>
3787
  <h2>Megablocks kernel</h2>
3788
  <h2>Yamoe Kernel</h2>
@@ -3793,7 +3726,7 @@ Cell: nv | 0.71s
3793
  <span onclick="toggleOutput('setup')" style="cursor: pointer;">▼ output</span>
3794
  <span id="uv-indicator-setup" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3795
  </span> |
3796
- Cell: setup | 16.83s | FAILED
3797
  | <button class="run-btn" onclick="runCell('setup')">▶ run</button>
3798
  <button class="copy-btn" onclick="copyCell('setup')">Copy</button>
3799
  <a href="cells/setup.py" target="_blank" class="raw-btn">Raw</a>
 
3715
  </div>
3716
 
3717
  <div class="main-content">
3718
+ <h1>Comparison of Megablocks and Yamoe Kernels</h1>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3719
  <p>This note compares the performance of the Megablocks and Yamoe kernels on the GPT-OSS-20B model.</p>
3720
  <h2>Megablocks kernel</h2>
3721
  <h2>Yamoe Kernel</h2>
 
3726
  <span onclick="toggleOutput('setup')" style="cursor: pointer;">▼ output</span>
3727
  <span id="uv-indicator-setup" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3728
  </span> |
3729
+ Cell: setup | 17.08s | FAILED
3730
  | <button class="run-btn" onclick="runCell('setup')">▶ run</button>
3731
  <button class="copy-btn" onclick="copyCell('setup')">Copy</button>
3732
  <a href="cells/setup.py" target="_blank" class="raw-btn">Raw</a>
moe_benchmarks/megablocks_yamoe/torch_profile.html CHANGED
@@ -3720,7 +3720,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3720
  <span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
3721
  <span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
3722
  </span> |
3723
- Cell: utils | deps: torch, numpy | 36.15s
3724
  | <button class="run-btn" onclick="runCell('utils')">▶ run</button>
3725
  <button class="copy-btn" onclick="copyCell('utils')">Copy</button>
3726
  <a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
@@ -3794,24 +3794,24 @@ Cell: utils | deps: torch, numpy | 36.15s
3794
  <div class="uv-install-logs" id="uv-logs-utils">
3795
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3796
  <div class="uv-logs-content" style="display: none;">
 
 
3797
  Downloading networkx (1.9MiB)
3798
  Downloading setuptools (1.1MiB)
3799
- Downloading numpy (16.2MiB)
3800
  Downloading nvidia-cufile-cu12 (1.1MiB)
 
 
 
 
3801
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3802
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3803
  Downloading triton (148.3MiB)
 
3804
  Downloading nvidia-cufft-cu12 (184.2MiB)
3805
- Downloading nvidia-curand-cu12 (60.7MiB)
3806
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3807
  Downloading nvidia-cusparse-cu12 (274.9MiB)
3808
- Downloading sympy (6.0MiB)
3809
- Downloading nvidia-cublas-cu12 (566.8MiB)
3810
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3811
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3812
- Downloading nvidia-nccl-cu12 (307.4MiB)
3813
  Downloading torch (846.9MiB)
3814
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3815
  Downloading nvidia-cufile-cu12
3816
  Downloading setuptools
3817
  Downloading networkx
@@ -3830,7 +3830,7 @@ Downloading nvidia-cusparselt-cu12 (273.9MiB)
3830
  Downloading nvidia-cublas-cu12
3831
  Downloading nvidia-cudnn-cu12
3832
  Downloading torch
3833
- Installed 26 packages in 452ms
3834
  </div>
3835
  </div>
3836
  </div>
@@ -3843,7 +3843,7 @@ Installed 26 packages in 452ms
3843
  <span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
3844
  <span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
3845
  </span> |
3846
- Cell: bench_utils | deps: torch, numpy | 34.88s
3847
  | <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
3849
  <a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
@@ -4331,24 +4331,24 @@ Cell: bench_utils | deps: torch, numpy | 34.88s
4331
  <div class="uv-install-logs" id="uv-logs-bench_utils">
4332
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4333
  <div class="uv-logs-content" style="display: none;">
4334
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4335
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4336
- Downloading setuptools (1.1MiB)
4337
- Downloading nvidia-curand-cu12 (60.7MiB)
4338
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4339
  Downloading sympy (6.0MiB)
 
 
 
4340
  Downloading nvidia-cusolver-cu12 (255.1MiB)
4341
  Downloading numpy (16.2MiB)
4342
- Downloading nvidia-cufft-cu12 (184.2MiB)
4343
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4344
  Downloading torch (846.9MiB)
4345
- Downloading triton (148.3MiB)
4346
  Downloading nvidia-cusparse-cu12 (274.9MiB)
4347
- Downloading nvidia-cufile-cu12 (1.1MiB)
4348
  Downloading nvidia-cublas-cu12 (566.8MiB)
4349
- Downloading networkx (1.9MiB)
 
 
 
 
4350
  Downloading nvidia-nccl-cu12 (307.4MiB)
4351
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4352
  Downloading nvidia-cufile-cu12
4353
  Downloading setuptools
4354
  Downloading networkx
@@ -4361,13 +4361,13 @@ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4361
  Downloading triton
4362
  Downloading nvidia-cufft-cu12
4363
  Downloading nvidia-cusolver-cu12
4364
- Downloading nvidia-cusparse-cu12
4365
  Downloading nvidia-cusparselt-cu12
 
4366
  Downloading nvidia-nccl-cu12
4367
  Downloading nvidia-cublas-cu12
4368
  Downloading nvidia-cudnn-cu12
4369
  Downloading torch
4370
- Installed 26 packages in 453ms
4371
  </div>
4372
  </div>
4373
  </div>
@@ -4381,7 +4381,7 @@ Installed 26 packages in 453ms
4381
  <span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
4382
  <span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
4383
  </span> |
4384
- Cell: config | deps: torch, numpy | 37.12s
4385
  | <button class="run-btn" onclick="runCell('config')">▶ run</button>
4386
  <button class="copy-btn" onclick="copyCell('config')">Copy</button>
4387
  <a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
@@ -4441,24 +4441,24 @@ Cell: config | deps: torch, numpy | 37.12s
4441
  <div class="uv-install-logs" id="uv-logs-config">
4442
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4443
  <div class="uv-logs-content" style="display: none;">
4444
- Downloading networkx (1.9MiB)
4445
- Downloading sympy (6.0MiB)
4446
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4447
- Downloading numpy (16.2MiB)
4448
  Downloading nvidia-cufile-cu12 (1.1MiB)
 
 
 
4449
  Downloading triton (148.3MiB)
4450
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
4451
  Downloading nvidia-cublas-cu12 (566.8MiB)
4452
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4453
  Downloading setuptools (1.1MiB)
4454
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4455
- Downloading nvidia-curand-cu12 (60.7MiB)
4456
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4457
- Downloading nvidia-cufft-cu12 (184.2MiB)
4458
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4459
  Downloading nvidia-nccl-cu12 (307.4MiB)
 
 
 
 
4460
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4461
  Downloading torch (846.9MiB)
 
4462
  Downloading nvidia-cufile-cu12
4463
  Downloading setuptools
4464
  Downloading networkx
@@ -4471,13 +4471,13 @@ Downloading torch (846.9MiB)
4471
  Downloading triton
4472
  Downloading nvidia-cufft-cu12
4473
  Downloading nvidia-cusolver-cu12
4474
- Downloading nvidia-cusparselt-cu12
4475
  Downloading nvidia-cusparse-cu12
 
4476
  Downloading nvidia-nccl-cu12
4477
  Downloading nvidia-cublas-cu12
4478
  Downloading nvidia-cudnn-cu12
4479
  Downloading torch
4480
- Installed 26 packages in 453ms
4481
  </div>
4482
  </div>
4483
  </div>
@@ -4490,7 +4490,7 @@ Installed 26 packages in 453ms
4490
  <span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
4491
  <span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
4492
  </span> |
4493
- Cell: save_data | deps: torch, numpy | 39.39s
4494
  | <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
4495
  <button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
4496
  <a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
@@ -4585,24 +4585,24 @@ Down sum: 206.729263
4585
  <div class="uv-install-logs" id="uv-logs-save_data">
4586
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4587
  <div class="uv-logs-content" style="display: none;">
4588
- Downloading networkx (1.9MiB)
4589
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4590
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4591
- Downloading numpy (16.2MiB)
4592
- Downloading sympy (6.0MiB)
4593
- Downloading nvidia-nccl-cu12 (307.4MiB)
4594
  Downloading setuptools (1.1MiB)
4595
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4596
- Downloading nvidia-cufft-cu12 (184.2MiB)
4597
  Downloading nvidia-cudnn-cu12 (674.0MiB)
4598
- Downloading nvidia-cufile-cu12 (1.1MiB)
 
 
 
4599
  Downloading nvidia-cusolver-cu12 (255.1MiB)
4600
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4601
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4602
  Downloading nvidia-curand-cu12 (60.7MiB)
4603
- Downloading torch (846.9MiB)
 
 
 
4604
  Downloading triton (148.3MiB)
 
 
 
4605
  Downloading nvidia-cublas-cu12 (566.8MiB)
 
4606
  Downloading nvidia-cufile-cu12
4607
  Downloading setuptools
4608
  Downloading networkx
@@ -4621,17 +4621,17 @@ Downloading nvidia-cublas-cu12 (566.8MiB)
4621
  Downloading nvidia-cublas-cu12
4622
  Downloading nvidia-cudnn-cu12
4623
  Downloading torch
4624
- Installed 26 packages in 464ms
4625
  </div>
4626
  </div>
4627
  <div class="cell-artifacts">
4628
  <h4>Artifacts:</h4>
4629
- <a href="artifacts/save_data/router_bias.pt" class="artifact" target="_blank">router_bias.pt</a>
4630
- <a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
4631
- <a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
4632
  <a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
4633
- <a href="artifacts/save_data/down_proj.pt" class="artifact" target="_blank">down_proj.pt</a>
4634
  <a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
 
 
 
4635
  </div>
4636
  </div>
4637
  </div>
@@ -4645,7 +4645,7 @@ Installed 26 packages in 464ms
4645
  <span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
4646
  <span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
4647
  </span> |
4648
- Cell: yamoe_run | deps: torch, kernels, numpy | 38.20s
4649
  | <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
4650
  <button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
4651
  <a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
@@ -4938,10 +4938,10 @@ Input Variation: +0.001 * iteration (deterministic)
4938
 
4939
  Warming up (10 iterations)...
4940
  Benchmarking (50 iterations)...
4941
- Progress: 20% complete (avg: 4.253 ms)
4942
- Progress: 40% complete (avg: 4.249 ms)
4943
- Progress: 60% complete (avg: 4.249 ms)
4944
- Progress: 80% complete (avg: 4.249 ms)
4945
 
4946
  Output tensors:
4947
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
@@ -4952,46 +4952,46 @@ Iterations: 50
4952
 
4953
  Latency Statistics:
4954
  Average: 4.249 ms
4955
- Min: 4.135 ms
4956
- Max: 4.296 ms
4957
- Std Dev: 0.023 ms
4958
 
4959
  Percentiles:
4960
- P50 (median): 4.252 ms
4961
- P95: 4.274 ms
4962
- P99: 4.289 ms
4963
 
4964
  Throughput:
4965
- Tokens/sec: 23533.4
4966
- Std Dev: 128.2
4967
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4968
 
4969
  Saved benchmark results to yamoe_results.json
4970
 
4971
- Output sum: 3.971905
4972
  </div>
4973
  <div class="uv-install-logs" id="uv-logs-yamoe_run">
4974
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4975
  <div class="uv-logs-content" style="display: none;">
4976
- Downloading sympy (6.0MiB)
4977
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4978
- Downloading hf-xet (3.0MiB)
4979
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4980
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4981
- Downloading numpy (16.2MiB)
 
4982
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
4983
- Downloading nvidia-cufile-cu12 (1.1MiB)
4984
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4985
- Downloading triton (148.3MiB)
4986
  Downloading nvidia-cublas-cu12 (566.8MiB)
4987
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4988
  Downloading nvidia-cusparse-cu12 (274.9MiB)
 
4989
  Downloading nvidia-nccl-cu12 (307.4MiB)
 
 
 
4990
  Downloading nvidia-curand-cu12 (60.7MiB)
4991
- Downloading torch (846.9MiB)
4992
- Downloading nvidia-cufft-cu12 (184.2MiB)
4993
  Downloading setuptools (1.1MiB)
4994
- Downloading networkx (1.9MiB)
 
4995
  Downloading nvidia-cufile-cu12
4996
  Downloading hf-xet
4997
  Downloading setuptools
@@ -5005,19 +5005,19 @@ Downloading networkx (1.9MiB)
5005
  Downloading triton
5006
  Downloading nvidia-cufft-cu12
5007
  Downloading nvidia-cusolver-cu12
5008
- Downloading nvidia-cusparse-cu12
5009
  Downloading nvidia-cusparselt-cu12
 
5010
  Downloading nvidia-nccl-cu12
5011
  Downloading nvidia-cublas-cu12
5012
  Downloading nvidia-cudnn-cu12
5013
  Downloading torch
5014
- Installed 37 packages in 458ms
5015
  </div>
5016
  </div>
5017
  <div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00&lt;?, ?it/s]
5018
- Fetching 6 files: 17%|█▋ | 1/6 [00:00&lt;00:01, 4.00it/s]
5019
- Fetching 6 files: 50%|█████ | 3/6 [00:00&lt;00:00, 4.44it/s]
5020
- Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 8.79it/s]</div>
5021
  <div class="cell-artifacts">
5022
  <h4>Artifacts:</h4>
5023
  <a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
@@ -5034,7 +5034,7 @@ Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 8.7
5034
  <span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
5035
  <span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
5036
  </span> |
5037
- Cell: binned_run | deps: torch, numpy | 39.24s
5038
  | <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
5039
  <button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
5040
  <a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
@@ -5448,10 +5448,10 @@ Input Variation: +0.001 * iteration (deterministic)
5448
 
5449
  Warming up (10 iterations)...
5450
  Benchmarking (50 iterations)...
5451
- Progress: 20% complete (avg: 37.466 ms)
5452
- Progress: 40% complete (avg: 37.465 ms)
5453
- Progress: 60% complete (avg: 37.162 ms)
5454
- Progress: 80% complete (avg: 36.629 ms)
5455
 
5456
  Output tensors:
5457
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
@@ -5461,19 +5461,19 @@ Output tensors:
5461
  Iterations: 50
5462
 
5463
  Latency Statistics:
5464
- Average: 36.141 ms
5465
- Min: 33.218 ms
5466
- Max: 38.347 ms
5467
- Std Dev: 1.387 ms
5468
 
5469
  Percentiles:
5470
- P50 (median): 36.579 ms
5471
- P95: 37.791 ms
5472
- P99: 38.303 ms
5473
 
5474
  Throughput:
5475
- Tokens/sec: 2767.0
5476
- Std Dev: 108.1
5477
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
5478
 
5479
  Saved benchmark results to binned_results.json
@@ -5484,23 +5484,23 @@ Output sum: 3.971905
5484
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
5485
  <div class="uv-logs-content" style="display: none;">
5486
  Downloading setuptools (1.1MiB)
5487
- Downloading numpy (16.2MiB)
5488
- Downloading networkx (1.9MiB)
5489
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 
5490
  Downloading nvidia-cublas-cu12 (566.8MiB)
 
 
 
5491
  Downloading nvidia-cusolver-cu12 (255.1MiB)
5492
- Downloading nvidia-cudnn-cu12 (674.0MiB)
5493
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
5494
- Downloading nvidia-cusparse-cu12 (274.9MiB)
5495
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
5496
- Downloading sympy (6.0MiB)
5497
  Downloading nvidia-curand-cu12 (60.7MiB)
 
5498
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
 
5499
  Downloading nvidia-cufile-cu12 (1.1MiB)
5500
- Downloading nvidia-cufft-cu12 (184.2MiB)
5501
- Downloading nvidia-nccl-cu12 (307.4MiB)
5502
- Downloading torch (846.9MiB)
5503
- Downloading triton (148.3MiB)
5504
  Downloading nvidia-cufile-cu12
5505
  Downloading setuptools
5506
  Downloading networkx
@@ -5513,13 +5513,13 @@ Downloading triton (148.3MiB)
5513
  Downloading triton
5514
  Downloading nvidia-cufft-cu12
5515
  Downloading nvidia-cusolver-cu12
5516
- Downloading nvidia-cusparselt-cu12
5517
  Downloading nvidia-cusparse-cu12
 
5518
  Downloading nvidia-nccl-cu12
5519
  Downloading nvidia-cublas-cu12
5520
  Downloading nvidia-cudnn-cu12
5521
  Downloading torch
5522
- Installed 26 packages in 444ms
5523
  </div>
5524
  </div>
5525
  <div class="cell-artifacts">
@@ -5538,7 +5538,7 @@ Installed 26 packages in 444ms
5538
  <span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
5539
  <span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
5540
  </span> |
5541
- Cell: gptoss_run | deps: torch, numpy | 43.23s
5542
  | <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
5543
  <button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
5544
  <a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
@@ -5856,10 +5856,10 @@ Input Variation: +0.001 * iteration (deterministic)
5856
 
5857
  Warming up (10 iterations)...
5858
  Benchmarking (50 iterations)...
5859
- Progress: 20% complete (avg: 50.062 ms)
5860
- Progress: 40% complete (avg: 49.677 ms)
5861
- Progress: 60% complete (avg: 48.802 ms)
5862
- Progress: 80% complete (avg: 47.718 ms)
5863
 
5864
  Output tensors:
5865
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
@@ -5869,19 +5869,19 @@ Output tensors:
5869
  Iterations: 50
5870
 
5871
  Latency Statistics:
5872
- Average: 46.669 ms
5873
- Min: 40.355 ms
5874
- Max: 50.564 ms
5875
- Std Dev: 2.944 ms
5876
 
5877
  Percentiles:
5878
- P50 (median): 47.070 ms
5879
- P95: 50.338 ms
5880
- P99: 50.543 ms
5881
 
5882
  Throughput:
5883
- Tokens/sec: 2142.7
5884
- Std Dev: 139.3
5885
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
5886
 
5887
  Saved benchmark results to gptoss_results.json
@@ -5891,24 +5891,24 @@ Output sum: 11.532237
5891
  <div class="uv-install-logs" id="uv-logs-gptoss_run">
5892
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
5893
  <div class="uv-logs-content" style="display: none;">
 
5894
  Downloading networkx (1.9MiB)
5895
- Downloading nvidia-curand-cu12 (60.7MiB)
5896
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
5897
- Downloading nvidia-nccl-cu12 (307.4MiB)
5898
  Downloading triton (148.3MiB)
5899
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
5900
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
5901
- Downloading nvidia-cudnn-cu12 (674.0MiB)
5902
- Downloading nvidia-cufft-cu12 (184.2MiB)
5903
- Downloading nvidia-cublas-cu12 (566.8MiB)
5904
- Downloading sympy (6.0MiB)
5905
- Downloading setuptools (1.1MiB)
5906
  Downloading nvidia-cufile-cu12 (1.1MiB)
5907
- Downloading nvidia-cusolver-cu12 (255.1MiB)
5908
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 
 
 
 
5909
  Downloading numpy (16.2MiB)
5910
- Downloading torch (846.9MiB)
 
 
 
5911
  Downloading nvidia-cusparse-cu12 (274.9MiB)
 
 
 
5912
  Downloading nvidia-cufile-cu12
5913
  Downloading setuptools
5914
  Downloading networkx
@@ -5927,7 +5927,7 @@ Downloading nvidia-cusparse-cu12 (274.9MiB)
5927
  Downloading nvidia-cublas-cu12
5928
  Downloading nvidia-cudnn-cu12
5929
  Downloading torch
5930
- Installed 26 packages in 455ms
5931
  </div>
5932
  </div>
5933
  <div class="cell-artifacts">
@@ -5946,7 +5946,7 @@ Installed 26 packages in 455ms
5946
  <span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
5947
  <span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
5948
  </span> |
5949
- Cell: gptoss_training_run | deps: torch, numpy | 40.10s
5950
  | <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
5951
  <button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
5952
  <a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
@@ -6247,10 +6247,10 @@ Input Variation: +0.001 * iteration (deterministic)
6247
 
6248
  Warming up (10 iterations)...
6249
  Benchmarking (50 iterations)...
6250
- Progress: 20% complete (avg: 50.696 ms)
6251
- Progress: 40% complete (avg: 50.262 ms)
6252
- Progress: 60% complete (avg: 49.357 ms)
6253
- Progress: 80% complete (avg: 48.257 ms)
6254
 
6255
  Output tensors:
6256
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
@@ -6260,19 +6260,19 @@ Output tensors:
6260
  Iterations: 50
6261
 
6262
  Latency Statistics:
6263
- Average: 47.251 ms
6264
- Min: 40.696 ms
6265
- Max: 51.111 ms
6266
- Std Dev: 2.979 ms
6267
 
6268
  Percentiles:
6269
- P50 (median): 47.641 ms
6270
- P95: 50.990 ms
6271
- P99: 51.076 ms
6272
 
6273
  Throughput:
6274
- Tokens/sec: 2116.4
6275
- Std Dev: 138.0
6276
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
6277
 
6278
  Saved benchmark results to gptoss_training_results.json
@@ -6282,24 +6282,24 @@ Output sum: 11.532237
6282
  <div class="uv-install-logs" id="uv-logs-gptoss_training_run">
6283
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
6284
  <div class="uv-logs-content" style="display: none;">
6285
- Downloading setuptools (1.1MiB)
6286
- Downloading nvidia-curand-cu12 (60.7MiB)
6287
  Downloading nvidia-cudnn-cu12 (674.0MiB)
6288
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
6289
- Downloading numpy (16.2MiB)
6290
- Downloading nvidia-cufft-cu12 (184.2MiB)
6291
- Downloading nvidia-cusparse-cu12 (274.9MiB)
6292
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
6293
- Downloading nvidia-cublas-cu12 (566.8MiB)
6294
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
6295
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
6296
- Downloading networkx (1.9MiB)
6297
  Downloading nvidia-cusolver-cu12 (255.1MiB)
6298
- Downloading nvidia-cufile-cu12 (1.1MiB)
6299
  Downloading nvidia-nccl-cu12 (307.4MiB)
6300
- Downloading triton (148.3MiB)
 
 
 
6301
  Downloading sympy (6.0MiB)
 
 
 
6302
  Downloading torch (846.9MiB)
 
 
 
 
 
6303
  Downloading nvidia-cufile-cu12
6304
  Downloading setuptools
6305
  Downloading networkx
@@ -6318,7 +6318,7 @@ Downloading torch (846.9MiB)
6318
  Downloading nvidia-cublas-cu12
6319
  Downloading nvidia-cudnn-cu12
6320
  Downloading torch
6321
- Installed 26 packages in 444ms
6322
  </div>
6323
  </div>
6324
  <div class="cell-artifacts">
@@ -6337,7 +6337,7 @@ Installed 26 packages in 444ms
6337
  <span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
6338
  <span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
6339
  </span> |
6340
- Cell: megablocks_run | deps: torch, numpy, kernels | 47.19s
6341
  | <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
6342
  <button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
6343
  <a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
@@ -6566,10 +6566,10 @@ Input Variation: +0.001 * iteration (deterministic)
6566
 
6567
  Warming up (10 iterations)...
6568
  Benchmarking (50 iterations)...
6569
- Progress: 20% complete (avg: 0.854 ms)
6570
- Progress: 40% complete (avg: 0.841 ms)
6571
- Progress: 60% complete (avg: 0.843 ms)
6572
- Progress: 80% complete (avg: 2.673 ms)
6573
 
6574
  Output tensors:
6575
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.061104, 0.055115], mean=0.000056, std=0.013535, norm=4.593927
@@ -6579,19 +6579,19 @@ Output tensors:
6579
  Iterations: 50
6580
 
6581
  Latency Statistics:
6582
- Average: 3.825 ms
6583
- Min: 0.805 ms
6584
- Max: 8.440 ms
6585
- Std Dev: 3.666 ms
6586
 
6587
  Percentiles:
6588
- P50 (median): 0.853 ms
6589
- P95: 8.438 ms
6590
- P99: 8.439 ms
6591
 
6592
  Throughput:
6593
- Tokens/sec: 26146.6
6594
- Std Dev: 52691.2
6595
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
6596
 
6597
  Saved benchmark results to megablocks_results.json
@@ -6601,25 +6601,25 @@ Output sum: 6.473885
6601
  <div class="uv-install-logs" id="uv-logs-megablocks_run">
6602
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
6603
  <div class="uv-logs-content" style="display: none;">
6604
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
6605
- Downloading nvidia-cusparse-cu12 (274.9MiB)
6606
- Downloading setuptools (1.1MiB)
6607
- Downloading hf-xet (3.0MiB)
6608
- Downloading nvidia-cufile-cu12 (1.1MiB)
6609
  Downloading numpy (16.2MiB)
6610
- Downloading networkx (1.9MiB)
6611
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
6612
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
 
6613
  Downloading torch (846.9MiB)
 
 
6614
  Downloading nvidia-cudnn-cu12 (674.0MiB)
6615
- Downloading sympy (6.0MiB)
6616
- Downloading nvidia-cublas-cu12 (566.8MiB)
6617
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
6618
- Downloading nvidia-cusolver-cu12 (255.1MiB)
6619
  Downloading nvidia-cufft-cu12 (184.2MiB)
6620
- Downloading nvidia-nccl-cu12 (307.4MiB)
6621
- Downloading nvidia-curand-cu12 (60.7MiB)
6622
  Downloading triton (148.3MiB)
 
 
 
6623
  Downloading nvidia-cufile-cu12
6624
  Downloading hf-xet
6625
  Downloading setuptools
@@ -6639,19 +6639,19 @@ Downloading triton (148.3MiB)
6639
  Downloading nvidia-cublas-cu12
6640
  Downloading nvidia-cudnn-cu12
6641
  Downloading torch
6642
- Installed 37 packages in 459ms
6643
  </div>
6644
  </div>
6645
  <div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00&lt;?, ?it/s]
6646
- Fetching 66 files: 2%|▏ | 1/66 [00:00&lt;00:15, 4.22it/s]
6647
- Fetching 66 files: 5%|▍ | 3/66 [00:00&lt;00:07, 8.32it/s]
6648
- Fetching 66 files: 21%|██ | 14/66 [00:00&lt;00:01, 35.22it/s]
6649
- Fetching 66 files: 29%|██▉ | 19/66 [00:01&lt;00:02, 18.12it/s]
6650
- Fetching 66 files: 55%|█████▍ | 36/66 [00:01&lt;00:01, 29.43it/s]
6651
- Fetching 66 files: 70%|██████▉ | 46/66 [00:01&lt;00:00, 38.98it/s]
6652
- Fetching 66 files: 82%|████████▏ | 54/66 [00:01&lt;00:00, 42.11it/s]
6653
- Fetching 66 files: 95%|█████████▌| 63/66 [00:01&lt;00:00, 48.66it/s]
6654
- Fetching 66 files: 100%|██████████| 66/66 [00:01&lt;00:00, 33.84it/s]</div>
6655
  <div class="cell-artifacts">
6656
  <h4>Artifacts:</h4>
6657
  <a href="artifacts/megablocks_run/megablocks_results.json" class="artifact" target="_blank">megablocks_results.json</a>
@@ -6668,7 +6668,7 @@ Fetching 66 files: 100%|██████████| 66/66 [00:01&lt;00:00, 3
6668
  <span onclick="toggleOutput('visualization')" style="cursor: pointer;">▼ output</span>
6669
  <span id="uv-indicator-visualization" onclick="toggleUvLogsFromHeader('visualization')" style="cursor: pointer;">▶ uv-logs</span>
6670
  </span> |
6671
- Cell: visualization | deps: matplotlib | 3.13s
6672
  | <button class="run-btn" onclick="runCell('visualization')">▶ run</button>
6673
  <button class="copy-btn" onclick="copyCell('visualization')">Copy</button>
6674
  <a href="cells/visualization.py" target="_blank" class="raw-btn">Raw</a>
@@ -6914,30 +6914,30 @@ Loaded /repo/moe_benchmarks/megablocks_yamoe/.uvnote/cache/0febdf3420999533bc2e1
6914
  Performance Summary:
6915
  Implementation Avg (ms) P95 (ms) Tokens/sec Relative Speed
6916
  --------------------------------------------------------------------------------
6917
- megablocks_results 3.82 8.44 26147 1.00x
6918
- yamoe_results 4.25 4.27 23533 0.90x
6919
- binned_results 36.14 37.79 2767 0.11x
6920
- gptoss_results 46.67 50.34 2143 0.08x
6921
- gptoss_training_results 47.25 50.99 2116 0.08x
6922
-
6923
- Fastest: megablocks_results (3.82ms avg)
6924
- Slowest: gptoss_training_results (47.25ms avg)
6925
- Max Speedup: 12.4x
6926
  </div>
6927
  <div class="uv-install-logs" id="uv-logs-visualization">
6928
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
6929
  <div class="uv-logs-content" style="display: none;">
6930
- Downloading pillow (6.3MiB)
6931
- Downloading matplotlib (8.3MiB)
6932
- Downloading fonttools (4.7MiB)
6933
  Downloading numpy (16.2MiB)
6934
  Downloading kiwisolver (1.4MiB)
 
 
 
6935
  Downloading kiwisolver
6936
  Downloading pillow
6937
  Downloading fonttools
6938
  Downloading matplotlib
6939
  Downloading numpy
6940
- Installed 11 packages in 49ms
6941
  </div>
6942
  </div>
6943
  <div class="cell-artifacts">
 
3720
  <span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
3721
  <span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
3722
  </span> |
3723
+ Cell: utils | deps: torch, numpy | 34.86s
3724
  | <button class="run-btn" onclick="runCell('utils')">▶ run</button>
3725
  <button class="copy-btn" onclick="copyCell('utils')">Copy</button>
3726
  <a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
 
3794
  <div class="uv-install-logs" id="uv-logs-utils">
3795
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3796
  <div class="uv-logs-content" style="display: none;">
3797
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3798
+ Downloading sympy (6.0MiB)
3799
  Downloading networkx (1.9MiB)
3800
  Downloading setuptools (1.1MiB)
3801
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
3802
  Downloading nvidia-cufile-cu12 (1.1MiB)
3803
+ Downloading nvidia-nccl-cu12 (307.4MiB)
3804
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
3805
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
3806
+ Downloading nvidia-curand-cu12 (60.7MiB)
3807
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3808
+ Downloading numpy (16.2MiB)
3809
  Downloading triton (148.3MiB)
3810
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3811
  Downloading nvidia-cufft-cu12 (184.2MiB)
 
 
3812
  Downloading nvidia-cusparse-cu12 (274.9MiB)
 
 
 
 
 
3813
  Downloading torch (846.9MiB)
3814
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3815
  Downloading nvidia-cufile-cu12
3816
  Downloading setuptools
3817
  Downloading networkx
 
3830
  Downloading nvidia-cublas-cu12
3831
  Downloading nvidia-cudnn-cu12
3832
  Downloading torch
3833
+ Installed 26 packages in 447ms
3834
  </div>
3835
  </div>
3836
  </div>
 
3843
  <span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
3844
  <span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
3845
  </span> |
3846
+ Cell: bench_utils | deps: torch, numpy | 35.19s
3847
  | <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
3849
  <a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
 
4331
  <div class="uv-install-logs" id="uv-logs-bench_utils">
4332
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4333
  <div class="uv-logs-content" style="display: none;">
 
 
 
 
 
4334
  Downloading sympy (6.0MiB)
4335
+ Downloading networkx (1.9MiB)
4336
+ Downloading setuptools (1.1MiB)
4337
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4338
  Downloading nvidia-cusolver-cu12 (255.1MiB)
4339
  Downloading numpy (16.2MiB)
4340
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4341
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4342
  Downloading torch (846.9MiB)
4343
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4344
  Downloading nvidia-cusparse-cu12 (274.9MiB)
 
4345
  Downloading nvidia-cublas-cu12 (566.8MiB)
4346
+ Downloading nvidia-curand-cu12 (60.7MiB)
4347
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4348
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4349
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4350
+ Downloading triton (148.3MiB)
4351
  Downloading nvidia-nccl-cu12 (307.4MiB)
 
4352
  Downloading nvidia-cufile-cu12
4353
  Downloading setuptools
4354
  Downloading networkx
 
4361
  Downloading triton
4362
  Downloading nvidia-cufft-cu12
4363
  Downloading nvidia-cusolver-cu12
 
4364
  Downloading nvidia-cusparselt-cu12
4365
+ Downloading nvidia-cusparse-cu12
4366
  Downloading nvidia-nccl-cu12
4367
  Downloading nvidia-cublas-cu12
4368
  Downloading nvidia-cudnn-cu12
4369
  Downloading torch
4370
+ Installed 26 packages in 455ms
4371
  </div>
4372
  </div>
4373
  </div>
 
4381
  <span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
4382
  <span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
4383
  </span> |
4384
+ Cell: config | deps: torch, numpy | 34.96s
4385
  | <button class="run-btn" onclick="runCell('config')">▶ run</button>
4386
  <button class="copy-btn" onclick="copyCell('config')">Copy</button>
4387
  <a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
 
4441
  <div class="uv-install-logs" id="uv-logs-config">
4442
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4443
  <div class="uv-logs-content" style="display: none;">
 
 
 
 
4444
  Downloading nvidia-cufile-cu12 (1.1MiB)
4445
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4446
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4447
+ Downloading sympy (6.0MiB)
4448
  Downloading triton (148.3MiB)
4449
+ Downloading nvidia-curand-cu12 (60.7MiB)
4450
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4451
  Downloading nvidia-cublas-cu12 (566.8MiB)
4452
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4453
  Downloading setuptools (1.1MiB)
 
 
 
 
 
4454
  Downloading nvidia-nccl-cu12 (307.4MiB)
4455
+ Downloading networkx (1.9MiB)
4456
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4457
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4458
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4459
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4460
  Downloading torch (846.9MiB)
4461
+ Downloading numpy (16.2MiB)
4462
  Downloading nvidia-cufile-cu12
4463
  Downloading setuptools
4464
  Downloading networkx
 
4471
  Downloading triton
4472
  Downloading nvidia-cufft-cu12
4473
  Downloading nvidia-cusolver-cu12
 
4474
  Downloading nvidia-cusparse-cu12
4475
+ Downloading nvidia-cusparselt-cu12
4476
  Downloading nvidia-nccl-cu12
4477
  Downloading nvidia-cublas-cu12
4478
  Downloading nvidia-cudnn-cu12
4479
  Downloading torch
4480
+ Installed 26 packages in 449ms
4481
  </div>
4482
  </div>
4483
  </div>
 
4490
  <span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
4491
  <span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
4492
  </span> |
4493
+ Cell: save_data | deps: torch, numpy | 39.26s
4494
  | <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
4495
  <button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
4496
  <a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
 
4585
  <div class="uv-install-logs" id="uv-logs-save_data">
4586
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4587
  <div class="uv-logs-content" style="display: none;">
 
 
 
 
 
 
4588
  Downloading setuptools (1.1MiB)
 
 
4589
  Downloading nvidia-cudnn-cu12 (674.0MiB)
4590
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4591
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4592
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4593
+ Downloading sympy (6.0MiB)
4594
  Downloading nvidia-cusolver-cu12 (255.1MiB)
 
 
4595
  Downloading nvidia-curand-cu12 (60.7MiB)
4596
+ Downloading numpy (16.2MiB)
4597
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4598
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4599
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4600
  Downloading triton (148.3MiB)
4601
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4602
+ Downloading networkx (1.9MiB)
4603
+ Downloading torch (846.9MiB)
4604
  Downloading nvidia-cublas-cu12 (566.8MiB)
4605
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4606
  Downloading nvidia-cufile-cu12
4607
  Downloading setuptools
4608
  Downloading networkx
 
4621
  Downloading nvidia-cublas-cu12
4622
  Downloading nvidia-cudnn-cu12
4623
  Downloading torch
4624
+ Installed 26 packages in 453ms
4625
  </div>
4626
  </div>
4627
  <div class="cell-artifacts">
4628
  <h4>Artifacts:</h4>
 
 
 
4629
  <a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
4630
+ <a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
4631
  <a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
4632
+ <a href="artifacts/save_data/down_proj.pt" class="artifact" target="_blank">down_proj.pt</a>
4633
+ <a href="artifacts/save_data/router_bias.pt" class="artifact" target="_blank">router_bias.pt</a>
4634
+ <a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
4635
  </div>
4636
  </div>
4637
  </div>
 
4645
  <span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
4646
  <span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
4647
  </span> |
4648
+ Cell: yamoe_run | deps: torch, kernels, numpy | 39.67s
4649
  | <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
4650
  <button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
4651
  <a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
 
4938
 
4939
  Warming up (10 iterations)...
4940
  Benchmarking (50 iterations)...
4941
+ Progress: 20% complete (avg: 4.250 ms)
4942
+ Progress: 40% complete (avg: 4.247 ms)
4943
+ Progress: 60% complete (avg: 4.248 ms)
4944
+ Progress: 80% complete (avg: 4.248 ms)
4945
 
4946
  Output tensors:
4947
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
 
4952
 
4953
  Latency Statistics:
4954
  Average: 4.249 ms
4955
+ Min: 4.146 ms
4956
+ Max: 4.270 ms
4957
+ Std Dev: 0.019 ms
4958
 
4959
  Percentiles:
4960
+ P50 (median): 4.253 ms
4961
+ P95: 4.266 ms
4962
+ P99: 4.270 ms
4963
 
4964
  Throughput:
4965
+ Tokens/sec: 23535.8
4966
+ Std Dev: 107.7
4967
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4968
 
4969
  Saved benchmark results to yamoe_results.json
4970
 
4971
+ Output sum: 3.971906
4972
  </div>
4973
  <div class="uv-install-logs" id="uv-logs-yamoe_run">
4974
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4975
  <div class="uv-logs-content" style="display: none;">
 
 
 
4976
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4977
+ Downloading networkx (1.9MiB)
4978
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4979
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4980
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
 
 
4981
  Downloading nvidia-cublas-cu12 (566.8MiB)
4982
+ Downloading torch (846.9MiB)
4983
  Downloading nvidia-cusparse-cu12 (274.9MiB)
4984
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4985
  Downloading nvidia-nccl-cu12 (307.4MiB)
4986
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4987
+ Downloading triton (148.3MiB)
4988
+ Downloading numpy (16.2MiB)
4989
  Downloading nvidia-curand-cu12 (60.7MiB)
4990
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4991
+ Downloading sympy (6.0MiB)
4992
  Downloading setuptools (1.1MiB)
4993
+ Downloading hf-xet (3.0MiB)
4994
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4995
  Downloading nvidia-cufile-cu12
4996
  Downloading hf-xet
4997
  Downloading setuptools
 
5005
  Downloading triton
5006
  Downloading nvidia-cufft-cu12
5007
  Downloading nvidia-cusolver-cu12
 
5008
  Downloading nvidia-cusparselt-cu12
5009
+ Downloading nvidia-cusparse-cu12
5010
  Downloading nvidia-nccl-cu12
5011
  Downloading nvidia-cublas-cu12
5012
  Downloading nvidia-cudnn-cu12
5013
  Downloading torch
5014
+ Installed 37 packages in 453ms
5015
  </div>
5016
  </div>
5017
  <div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00&lt;?, ?it/s]
5018
+ Fetching 6 files: 17%|█▋ | 1/6 [00:00&lt;00:01, 3.69it/s]
5019
+ Fetching 6 files: 50%|█████ | 3/6 [00:00&lt;00:00, 3.01it/s]
5020
+ Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 6.13it/s]</div>
5021
  <div class="cell-artifacts">
5022
  <h4>Artifacts:</h4>
5023
  <a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
 
5034
  <span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
5035
  <span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
5036
  </span> |
5037
+ Cell: binned_run | deps: torch, numpy | 39.33s
5038
  | <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
5039
  <button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
5040
  <a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
 
5448
 
5449
  Warming up (10 iterations)...
5450
  Benchmarking (50 iterations)...
5451
+ Progress: 20% complete (avg: 38.670 ms)
5452
+ Progress: 40% complete (avg: 38.443 ms)
5453
+ Progress: 60% complete (avg: 38.101 ms)
5454
+ Progress: 80% complete (avg: 37.640 ms)
5455
 
5456
  Output tensors:
5457
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
 
5461
  Iterations: 50
5462
 
5463
  Latency Statistics:
5464
+ Average: 37.043 ms
5465
+ Min: 33.611 ms
5466
+ Max: 39.776 ms
5467
+ Std Dev: 1.656 ms
5468
 
5469
  Percentiles:
5470
+ P50 (median): 37.083 ms
5471
+ P95: 39.325 ms
5472
+ P99: 39.736 ms
5473
 
5474
  Throughput:
5475
+ Tokens/sec: 2699.6
5476
+ Std Dev: 122.6
5477
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
5478
 
5479
  Saved benchmark results to binned_results.json
 
5484
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
5485
  <div class="uv-logs-content" style="display: none;">
5486
  Downloading setuptools (1.1MiB)
5487
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
5488
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
5489
+ Downloading networkx (1.9MiB)
5490
  Downloading nvidia-cublas-cu12 (566.8MiB)
5491
+ Downloading nvidia-cufft-cu12 (184.2MiB)
5492
+ Downloading triton (148.3MiB)
5493
+ Downloading nvidia-nccl-cu12 (307.4MiB)
5494
  Downloading nvidia-cusolver-cu12 (255.1MiB)
5495
+ Downloading torch (846.9MiB)
 
 
 
 
5496
  Downloading nvidia-curand-cu12 (60.7MiB)
5497
+ Downloading numpy (16.2MiB)
5498
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
5499
+ Downloading sympy (6.0MiB)
5500
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
5501
  Downloading nvidia-cufile-cu12 (1.1MiB)
5502
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
5503
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
 
 
5504
  Downloading nvidia-cufile-cu12
5505
  Downloading setuptools
5506
  Downloading networkx
 
5513
  Downloading triton
5514
  Downloading nvidia-cufft-cu12
5515
  Downloading nvidia-cusolver-cu12
 
5516
  Downloading nvidia-cusparse-cu12
5517
+ Downloading nvidia-cusparselt-cu12
5518
  Downloading nvidia-nccl-cu12
5519
  Downloading nvidia-cublas-cu12
5520
  Downloading nvidia-cudnn-cu12
5521
  Downloading torch
5522
+ Installed 26 packages in 526ms
5523
  </div>
5524
  </div>
5525
  <div class="cell-artifacts">
 
5538
  <span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
5539
  <span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
5540
  </span> |
5541
+ Cell: gptoss_run | deps: torch, numpy | 40.94s
5542
  | <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
5543
  <button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
5544
  <a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
 
5856
 
5857
  Warming up (10 iterations)...
5858
  Benchmarking (50 iterations)...
5859
+ Progress: 20% complete (avg: 50.824 ms)
5860
+ Progress: 40% complete (avg: 50.316 ms)
5861
+ Progress: 60% complete (avg: 49.490 ms)
5862
+ Progress: 80% complete (avg: 48.422 ms)
5863
 
5864
  Output tensors:
5865
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
 
5869
  Iterations: 50
5870
 
5871
  Latency Statistics:
5872
+ Average: 47.380 ms
5873
+ Min: 40.922 ms
5874
+ Max: 51.281 ms
5875
+ Std Dev: 2.917 ms
5876
 
5877
  Percentiles:
5878
+ P50 (median): 48.137 ms
5879
+ P95: 51.064 ms
5880
+ P99: 51.261 ms
5881
 
5882
  Throughput:
5883
+ Tokens/sec: 2110.6
5884
+ Std Dev: 134.1
5885
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
5886
 
5887
  Saved benchmark results to gptoss_results.json
 
5891
  <div class="uv-install-logs" id="uv-logs-gptoss_run">
5892
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
5893
  <div class="uv-logs-content" style="display: none;">
5894
+ Downloading setuptools (1.1MiB)
5895
  Downloading networkx (1.9MiB)
 
 
 
5896
  Downloading triton (148.3MiB)
 
 
 
 
 
 
 
5897
  Downloading nvidia-cufile-cu12 (1.1MiB)
 
5898
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
5899
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
5900
+ Downloading nvidia-cufft-cu12 (184.2MiB)
5901
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
5902
+ Downloading nvidia-curand-cu12 (60.7MiB)
5903
  Downloading numpy (16.2MiB)
5904
+ Downloading nvidia-nccl-cu12 (307.4MiB)
5905
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
5906
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
5907
+ Downloading sympy (6.0MiB)
5908
  Downloading nvidia-cusparse-cu12 (274.9MiB)
5909
+ Downloading torch (846.9MiB)
5910
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
5911
+ Downloading nvidia-cublas-cu12 (566.8MiB)
5912
  Downloading nvidia-cufile-cu12
5913
  Downloading setuptools
5914
  Downloading networkx
 
5927
  Downloading nvidia-cublas-cu12
5928
  Downloading nvidia-cudnn-cu12
5929
  Downloading torch
5930
+ Installed 26 packages in 449ms
5931
  </div>
5932
  </div>
5933
  <div class="cell-artifacts">
 
5946
  <span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
5947
  <span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
5948
  </span> |
5949
+ Cell: gptoss_training_run | deps: torch, numpy | 40.19s
5950
  | <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
5951
  <button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
5952
  <a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
 
6247
 
6248
  Warming up (10 iterations)...
6249
  Benchmarking (50 iterations)...
6250
+ Progress: 20% complete (avg: 51.217 ms)
6251
+ Progress: 40% complete (avg: 50.712 ms)
6252
+ Progress: 60% complete (avg: 49.798 ms)
6253
+ Progress: 80% complete (avg: 48.389 ms)
6254
 
6255
  Output tensors:
6256
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
 
6260
  Iterations: 50
6261
 
6262
  Latency Statistics:
6263
+ Average: 46.922 ms
6264
+ Min: 38.894 ms
6265
+ Max: 51.622 ms
6266
+ Std Dev: 3.930 ms
6267
 
6268
  Percentiles:
6269
+ P50 (median): 48.186 ms
6270
+ P95: 51.421 ms
6271
+ P99: 51.564 ms
6272
 
6273
  Throughput:
6274
+ Tokens/sec: 2131.2
6275
+ Std Dev: 188.8
6276
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
6277
 
6278
  Saved benchmark results to gptoss_training_results.json
 
6282
  <div class="uv-install-logs" id="uv-logs-gptoss_training_run">
6283
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
6284
  <div class="uv-logs-content" style="display: none;">
 
 
6285
  Downloading nvidia-cudnn-cu12 (674.0MiB)
 
 
 
 
 
 
6286
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 
 
6287
  Downloading nvidia-cusolver-cu12 (255.1MiB)
 
6288
  Downloading nvidia-nccl-cu12 (307.4MiB)
6289
+ Downloading nvidia-cufft-cu12 (184.2MiB)
6290
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
6291
+ Downloading networkx (1.9MiB)
6292
+ Downloading nvidia-curand-cu12 (60.7MiB)
6293
  Downloading sympy (6.0MiB)
6294
+ Downloading numpy (16.2MiB)
6295
+ Downloading nvidia-cufile-cu12 (1.1MiB)
6296
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
6297
  Downloading torch (846.9MiB)
6298
+ Downloading setuptools (1.1MiB)
6299
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
6300
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
6301
+ Downloading nvidia-cublas-cu12 (566.8MiB)
6302
+ Downloading triton (148.3MiB)
6303
  Downloading nvidia-cufile-cu12
6304
  Downloading setuptools
6305
  Downloading networkx
 
6318
  Downloading nvidia-cublas-cu12
6319
  Downloading nvidia-cudnn-cu12
6320
  Downloading torch
6321
+ Installed 26 packages in 548ms
6322
  </div>
6323
  </div>
6324
  <div class="cell-artifacts">
 
6337
  <span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
6338
  <span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
6339
  </span> |
6340
+ Cell: megablocks_run | deps: torch, numpy, kernels | 48.02s
6341
  | <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
6342
  <button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
6343
  <a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
 
6566
 
6567
  Warming up (10 iterations)...
6568
  Benchmarking (50 iterations)...
6569
+ Progress: 20% complete (avg: 0.894 ms)
6570
+ Progress: 40% complete (avg: 0.876 ms)
6571
+ Progress: 60% complete (avg: 0.872 ms)
6572
+ Progress: 80% complete (avg: 2.709 ms)
6573
 
6574
  Output tensors:
6575
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.061104, 0.055115], mean=0.000056, std=0.013535, norm=4.593927
 
6579
  Iterations: 50
6580
 
6581
  Latency Statistics:
6582
+ Average: 3.865 ms
6583
+ Min: 0.842 ms
6584
+ Max: 8.545 ms
6585
+ Std Dev: 3.691 ms
6586
 
6587
  Percentiles:
6588
+ P50 (median): 0.883 ms
6589
+ P95: 8.537 ms
6590
+ P99: 8.542 ms
6591
 
6592
  Throughput:
6593
+ Tokens/sec: 25871.1
6594
+ Std Dev: 50674.8
6595
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
6596
 
6597
  Saved benchmark results to megablocks_results.json
 
6601
  <div class="uv-install-logs" id="uv-logs-megablocks_run">
6602
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
6603
  <div class="uv-logs-content" style="display: none;">
6604
+ Downloading sympy (6.0MiB)
 
 
 
 
6605
  Downloading numpy (16.2MiB)
6606
+ Downloading nvidia-nccl-cu12 (307.4MiB)
6607
+ Downloading hf-xet (3.0MiB)
6608
+ Downloading setuptools (1.1MiB)
6609
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
6610
+ Downloading nvidia-cublas-cu12 (566.8MiB)
6611
  Downloading torch (846.9MiB)
6612
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
6613
+ Downloading nvidia-cufile-cu12 (1.1MiB)
6614
  Downloading nvidia-cudnn-cu12 (674.0MiB)
6615
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
6616
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
6617
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
6618
  Downloading nvidia-cufft-cu12 (184.2MiB)
 
 
6619
  Downloading triton (148.3MiB)
6620
+ Downloading networkx (1.9MiB)
6621
+ Downloading nvidia-curand-cu12 (60.7MiB)
6622
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
6623
  Downloading nvidia-cufile-cu12
6624
  Downloading hf-xet
6625
  Downloading setuptools
 
6639
  Downloading nvidia-cublas-cu12
6640
  Downloading nvidia-cudnn-cu12
6641
  Downloading torch
6642
+ Installed 37 packages in 525ms
6643
  </div>
6644
  </div>
6645
  <div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00&lt;?, ?it/s]
6646
+ Fetching 66 files: 2%|▏ | 1/66 [00:00&lt;00:15, 4.11it/s]
6647
+ Fetching 66 files: 15%|█▌ | 10/66 [00:00&lt;00:02, 21.70it/s]
6648
+ Fetching 66 files: 26%|██▌ | 17/66 [00:00&lt;00:02, 19.93it/s]
6649
+ Fetching 66 files: 47%|████▋ | 31/66 [00:01&lt;00:00, 39.55it/s]
6650
+ Fetching 66 files: 61%|██████ | 40/66 [00:01&lt;00:00, 29.77it/s]
6651
+ Fetching 66 files: 77%|███████▋ | 51/66 [00:01&lt;00:00, 38.15it/s]
6652
+ Fetching 66 files: 89%|████████▉ | 59/66 [00:01&lt;00:00, 42.51it/s]
6653
+ Fetching 66 files: 98%|█████████▊| 65/66 [00:01&lt;00:00, 44.76it/s]
6654
+ Fetching 66 files: 100%|██████████| 66/66 [00:01&lt;00:00, 35.09it/s]</div>
6655
  <div class="cell-artifacts">
6656
  <h4>Artifacts:</h4>
6657
  <a href="artifacts/megablocks_run/megablocks_results.json" class="artifact" target="_blank">megablocks_results.json</a>
 
6668
  <span onclick="toggleOutput('visualization')" style="cursor: pointer;">▼ output</span>
6669
  <span id="uv-indicator-visualization" onclick="toggleUvLogsFromHeader('visualization')" style="cursor: pointer;">▶ uv-logs</span>
6670
  </span> |
6671
+ Cell: visualization | deps: matplotlib | 3.14s
6672
  | <button class="run-btn" onclick="runCell('visualization')">▶ run</button>
6673
  <button class="copy-btn" onclick="copyCell('visualization')">Copy</button>
6674
  <a href="cells/visualization.py" target="_blank" class="raw-btn">Raw</a>
 
6914
  Performance Summary:
6915
  Implementation Avg (ms) P95 (ms) Tokens/sec Relative Speed
6916
  --------------------------------------------------------------------------------
6917
+ megablocks_results 3.87 8.54 25871 1.00x
6918
+ yamoe_results 4.25 4.27 23536 0.91x
6919
+ binned_results 37.04 39.33 2700 0.10x
6920
+ gptoss_training_results 46.92 51.42 2131 0.08x
6921
+ gptoss_results 47.38 51.06 2111 0.08x
6922
+
6923
+ Fastest: megablocks_results (3.87ms avg)
6924
+ Slowest: gptoss_results (47.38ms avg)
6925
+ Max Speedup: 12.3x
6926
  </div>
6927
  <div class="uv-install-logs" id="uv-logs-visualization">
6928
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
6929
  <div class="uv-logs-content" style="display: none;">
 
 
 
6930
  Downloading numpy (16.2MiB)
6931
  Downloading kiwisolver (1.4MiB)
6932
+ Downloading pillow (6.3MiB)
6933
+ Downloading fonttools (4.7MiB)
6934
+ Downloading matplotlib (8.3MiB)
6935
  Downloading kiwisolver
6936
  Downloading pillow
6937
  Downloading fonttools
6938
  Downloading matplotlib
6939
  Downloading numpy
6940
+ Installed 11 packages in 48ms
6941
  </div>
6942
  </div>
6943
  <div class="cell-artifacts">