Upload folder using huggingface_hub
Browse files- flash_attn/artifacts/benchmark/Attention Benchmark.csv +6 -6
- flash_attn/artifacts/benchmark/Attention Benchmark.png +2 -2
- flash_attn/benchmark.html +87 -96
- flash_attn/cells/benchmark.py +0 -2
- moe_benchmarks/megablocks/megablocks_only.html +1 -1
- moe_benchmarks/megablocks_yamoe/artifacts/binned_run/binned_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/artifacts/megablocks_run/megablocks_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/artifacts/visualization/moe_performance_comparison.png +2 -2
- moe_benchmarks/megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json +10 -10
- moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc +0 -0
- moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc +0 -0
- moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html +1 -1
- moe_benchmarks/megablocks_yamoe/torch_profile.html +211 -210
flash_attn/artifacts/benchmark/Attention Benchmark.csv
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
seq_len,torch_cudnn,torch_cudnn_compile_d,torch_cudnn_compile_ma,torch_flash,torch_flash_compile_d,torch_flash_compile_ma,hf_flash_attn,hf_flash_attn3
|
| 2 |
-
4224.000000,3.
|
| 3 |
-
4352.000000,4.
|
| 4 |
-
4416.000000,4.
|
| 5 |
-
4480.000000,4.
|
| 6 |
-
4544.000000,4.
|
| 7 |
-
4608.000000,4.
|
|
|
|
| 1 |
seq_len,torch_cudnn,torch_cudnn_compile_d,torch_cudnn_compile_ma,torch_flash,torch_flash_compile_d,torch_flash_compile_ma,hf_flash_attn,hf_flash_attn3
|
| 2 |
+
4224.000000,3.801472,3.790064,4.182320,3.968000,3.957824,4.311152,3.398160,3.330400
|
| 3 |
+
4352.000000,4.082944,4.082912,4.413488,4.400000,4.391936,4.738048,3.837424,3.758208
|
| 4 |
+
4416.000000,4.142624,4.135648,4.484160,4.452304,4.446096,4.792480,3.892064,3.864128
|
| 5 |
+
4480.000000,4.206144,4.198752,4.551808,4.530752,4.522944,4.873760,3.949344,3.870224
|
| 6 |
+
4544.000000,4.438320,4.433104,4.787584,4.584160,4.576640,4.934304,4.008960,3.974672
|
| 7 |
+
4608.000000,4.502432,4.495456,4.871872,4.660192,4.651040,5.029792,4.065616,3.984160
|
flash_attn/artifacts/benchmark/Attention Benchmark.png
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
flash_attn/benchmark.html
CHANGED
|
@@ -3722,7 +3722,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3722 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3724 |
</span> |
|
| 3725 |
-
Cell: nv | 0.
|
| 3726 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3727 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3728 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3745,7 +3745,7 @@ Cell: nv | 0.70s
|
|
| 3745 |
</div>
|
| 3746 |
</div>
|
| 3747 |
<div id="output-nv" class="cell-output">
|
| 3748 |
-
<div class="cell-stdout">Fri Sep 26
|
| 3749 |
+-----------------------------------------------------------------------------------------+
|
| 3750 |
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3751 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -3754,19 +3754,19 @@ Cell: nv | 0.70s
|
|
| 3754 |
| | | MIG M. |
|
| 3755 |
|=========================================+========================+======================|
|
| 3756 |
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3757 |
-
| 0%
|
| 3758 |
| | | N/A |
|
| 3759 |
+-----------------------------------------+------------------------+----------------------+
|
| 3760 |
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3761 |
-
| 0%
|
| 3762 |
| | | N/A |
|
| 3763 |
+-----------------------------------------+------------------------+----------------------+
|
| 3764 |
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3765 |
-
| 0% 31C P8
|
| 3766 |
| | | N/A |
|
| 3767 |
+-----------------------------------------+------------------------+----------------------+
|
| 3768 |
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3769 |
-
| 0%
|
| 3770 |
| | | N/A |
|
| 3771 |
+-----------------------------------------+------------------------+----------------------+
|
| 3772 |
|
|
@@ -3782,19 +3782,19 @@ Cell: nv | 0.70s
|
|
| 3782 |
</div>
|
| 3783 |
</div>
|
| 3784 |
|
| 3785 |
-
<div class="cell
|
| 3786 |
<div class="cell-header">
|
| 3787 |
<span class="collapse-indicators">
|
| 3788 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3789 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3790 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3791 |
</span> |
|
| 3792 |
-
Cell: benchmark |
|
| 3793 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3794 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3795 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3796 |
</div>
|
| 3797 |
-
<div id="code-benchmark" class="cell-code" data-lines="
|
| 3798 |
<div class="highlight-with-lines">
|
| 3799 |
<div class="line-numbers" id="lines-benchmark">
|
| 3800 |
<a class="line-number" data-cell="benchmark" data-line="1" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 1, true);">1</a>
|
|
@@ -4140,8 +4140,6 @@ Cell: benchmark | 77.48s | FAILED
|
|
| 4140 |
<a class="line-number" data-cell="benchmark" data-line="341" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 341, true);">341</a>
|
| 4141 |
<a class="line-number" data-cell="benchmark" data-line="342" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 342, true);">342</a>
|
| 4142 |
<a class="line-number" data-cell="benchmark" data-line="343" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 343, true);">343</a>
|
| 4143 |
-
<a class="line-number" data-cell="benchmark" data-line="344" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 344, true);">344</a>
|
| 4144 |
-
<a class="line-number" data-cell="benchmark" data-line="345" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 345, true);">345</a>
|
| 4145 |
</div>
|
| 4146 |
<div class="code-wrap">
|
| 4147 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
|
@@ -4487,8 +4485,6 @@ Cell: benchmark | 77.48s | FAILED
|
|
| 4487 |
<span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">inference_mode</span><span class="p">():</span>
|
| 4488 |
<span class="n">correctness</span><span class="p">()</span>
|
| 4489 |
<span class="n">fig</span> <span class="o">=</span> <span class="n">benchmark_fn</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="n">print_data</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">save_path</span><span class="o">=</span><span class="n">output_dir</span><span class="o">.</span><span class="n">as_posix</span><span class="p">())</span>
|
| 4490 |
-
<span class="n">fig</span><span class="o">.</span><span class="n">savefig</span><span class="p">(</span><span class="n">output_dir</span> <span class="o">/</span> <span class="s2">"attention_benchmark.png"</span><span class="p">,</span> <span class="n">dpi</span><span class="o">=</span><span class="mi">300</span><span class="p">,</span> <span class="n">bbox_inches</span><span class="o">=</span><span class="s2">"tight"</span><span class="p">)</span>
|
| 4491 |
-
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Benchmark plot saved to: </span><span class="si">{</span><span class="n">output_dir</span><span class="w"> </span><span class="o">/</span><span class="w"> </span><span class="s1">'attention_benchmark.png'</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
| 4492 |
</pre></div>
|
| 4493 |
|
| 4494 |
<div class="code-line-highlight" id="line-highlight-benchmark"></div>
|
|
@@ -4504,105 +4500,105 @@ xFormers not found.
|
|
| 4504 |
|
| 4505 |
|
| 4506 |
===== Testing shape: (1, 4224, 24, 128) =====
|
| 4507 |
-
torch_cudnn : absmax=0.
|
| 4508 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4509 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4510 |
-
torch_flash : absmax=0.
|
| 4511 |
-
torch_flash_compile_d : absmax=0.
|
| 4512 |
-
torch_flash_compile_ma : absmax=0.
|
| 4513 |
-
hf_flash_attn : absmax=0.
|
| 4514 |
-
hf_flash_attn3 : absmax=0.
|
| 4515 |
|
| 4516 |
|
| 4517 |
===== Testing shape: (1, 4352, 24, 128) =====
|
| 4518 |
-
torch_cudnn : absmax=0.
|
| 4519 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4520 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4521 |
-
torch_flash : absmax=0.
|
| 4522 |
-
torch_flash_compile_d : absmax=0.
|
| 4523 |
-
torch_flash_compile_ma : absmax=0.
|
| 4524 |
-
hf_flash_attn : absmax=0.
|
| 4525 |
-
hf_flash_attn3 : absmax=0.
|
| 4526 |
|
| 4527 |
|
| 4528 |
===== Testing shape: (1, 4416, 24, 128) =====
|
| 4529 |
-
torch_cudnn : absmax=0.
|
| 4530 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4531 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4532 |
-
torch_flash : absmax=0.
|
| 4533 |
-
torch_flash_compile_d : absmax=0.
|
| 4534 |
-
torch_flash_compile_ma : absmax=0.
|
| 4535 |
-
hf_flash_attn : absmax=0.
|
| 4536 |
-
hf_flash_attn3 : absmax=0.
|
| 4537 |
|
| 4538 |
|
| 4539 |
===== Testing shape: (1, 4480, 24, 128) =====
|
| 4540 |
-
torch_cudnn : absmax=0.
|
| 4541 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4542 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4543 |
-
torch_flash : absmax=0.
|
| 4544 |
-
torch_flash_compile_d : absmax=0.
|
| 4545 |
-
torch_flash_compile_ma : absmax=0.
|
| 4546 |
-
hf_flash_attn : absmax=0.
|
| 4547 |
-
hf_flash_attn3 : absmax=0.
|
| 4548 |
|
| 4549 |
|
| 4550 |
===== Testing shape: (1, 4544, 24, 128) =====
|
| 4551 |
-
torch_cudnn : absmax=0.
|
| 4552 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4553 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4554 |
-
torch_flash : absmax=0.
|
| 4555 |
-
torch_flash_compile_d : absmax=0.
|
| 4556 |
-
torch_flash_compile_ma : absmax=0.
|
| 4557 |
-
hf_flash_attn : absmax=0.
|
| 4558 |
-
hf_flash_attn3 : absmax=0.
|
| 4559 |
|
| 4560 |
|
| 4561 |
===== Testing shape: (1, 4608, 24, 128) =====
|
| 4562 |
-
torch_cudnn : absmax=0.
|
| 4563 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4564 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4565 |
-
torch_flash : absmax=0.
|
| 4566 |
-
torch_flash_compile_d : absmax=0.
|
| 4567 |
-
torch_flash_compile_ma : absmax=0.
|
| 4568 |
-
hf_flash_attn : absmax=0.
|
| 4569 |
-
hf_flash_attn3 : absmax=0.
|
| 4570 |
Attention Benchmark:
|
| 4571 |
seq_len torch_cudnn torch_cudnn_compile_d torch_cudnn_compile_ma torch_flash torch_flash_compile_d torch_flash_compile_ma hf_flash_attn hf_flash_attn3
|
| 4572 |
-
0 4224.0 3.
|
| 4573 |
-
1 4352.0 4.
|
| 4574 |
-
2 4416.0 4.
|
| 4575 |
-
3 4480.0 4.
|
| 4576 |
-
4 4544.0 4.
|
| 4577 |
-
5 4608.0 4.
|
| 4578 |
</div>
|
| 4579 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4580 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4581 |
<div class="uv-logs-content" style="display: none;">
|
| 4582 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
| 4583 |
Downloading setuptools (1.1MiB)
|
| 4584 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4585 |
-
Downloading
|
| 4586 |
Downloading pandas (11.8MiB)
|
|
|
|
| 4587 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4588 |
-
Downloading networkx (1.9MiB)
|
| 4589 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4590 |
-
Downloading nvidia-
|
| 4591 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4592 |
-
Downloading triton (148.3MiB)
|
| 4593 |
Downloading matplotlib (8.3MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4594 |
Downloading kiwisolver (1.4MiB)
|
| 4595 |
-
Downloading nvidia-
|
|
|
|
|
|
|
| 4596 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4597 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4598 |
-
Downloading
|
| 4599 |
-
Downloading torch (846.9MiB)
|
| 4600 |
-
Downloading numpy (16.2MiB)
|
| 4601 |
-
Downloading pillow (6.3MiB)
|
| 4602 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4603 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4604 |
-
Downloading fonttools (4.7MiB)
|
| 4605 |
-
Downloading sympy (6.0MiB)
|
| 4606 |
Downloading nvidia-cufile-cu12
|
| 4607 |
Downloading kiwisolver
|
| 4608 |
Downloading hf-xet
|
|
@@ -4616,34 +4612,29 @@ Downloading sympy (6.0MiB)
|
|
| 4616 |
Downloading numpy
|
| 4617 |
Downloading nvidia-nvjitlink-cu12
|
| 4618 |
Downloading nvidia-curand-cu12
|
| 4619 |
-
Downloading pandas
|
| 4620 |
Downloading nvidia-cuda-nvrtc-cu12
|
|
|
|
| 4621 |
Downloading triton
|
| 4622 |
Downloading nvidia-cufft-cu12
|
| 4623 |
Downloading nvidia-cusolver-cu12
|
| 4624 |
-
Downloading nvidia-cusparselt-cu12
|
| 4625 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 4626 |
Downloading nvidia-nccl-cu12
|
| 4627 |
Downloading nvidia-cublas-cu12
|
| 4628 |
Downloading nvidia-cudnn-cu12
|
| 4629 |
Downloading torch
|
| 4630 |
-
Installed 49 packages in
|
| 4631 |
</div>
|
| 4632 |
</div>
|
| 4633 |
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 4634 |
-
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:03,
|
| 4635 |
-
Fetching 20 files: 10%|█ | 2/20 [00:01<00:
|
| 4636 |
-
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00,
|
| 4637 |
|
| 4638 |
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4639 |
-
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 6.
|
| 4640 |
-
Fetching 4 files: 50
|
| 4641 |
-
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00,
|
| 4642 |
-
Traceback (most recent call last):
|
| 4643 |
-
File "/repo/flash_attn/.uvnote/cells/benchmark.py", line 344, in <module>
|
| 4644 |
-
fig.savefig(output_dir / "attention_benchmark.png", dpi=300, bbox_inches="tight")
|
| 4645 |
-
^^^^^^^^^^^
|
| 4646 |
-
AttributeError: 'NoneType' object has no attribute 'savefig'</div>
|
| 4647 |
<div class="cell-artifacts">
|
| 4648 |
<h4>Artifacts:</h4>
|
| 4649 |
<a href="artifacts/benchmark/Attention Benchmark.png" class="artifact" target="_blank">Attention Benchmark.png</a>
|
|
|
|
| 3722 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3724 |
</span> |
|
| 3725 |
+
Cell: nv | 0.67s
|
| 3726 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3727 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3728 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3745 |
</div>
|
| 3746 |
</div>
|
| 3747 |
<div id="output-nv" class="cell-output">
|
| 3748 |
+
<div class="cell-stdout">Fri Sep 26 03:53:23 2025
|
| 3749 |
+-----------------------------------------------------------------------------------------+
|
| 3750 |
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3751 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3754 |
| | | MIG M. |
|
| 3755 |
|=========================================+========================+======================|
|
| 3756 |
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3757 |
+
| 0% 38C P0 51W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3758 |
| | | N/A |
|
| 3759 |
+-----------------------------------------+------------------------+----------------------+
|
| 3760 |
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3761 |
+
| 0% 31C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3762 |
| | | N/A |
|
| 3763 |
+-----------------------------------------+------------------------+----------------------+
|
| 3764 |
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3765 |
+
| 0% 31C P8 25W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3766 |
| | | N/A |
|
| 3767 |
+-----------------------------------------+------------------------+----------------------+
|
| 3768 |
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3769 |
+
| 0% 31C P8 25W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3770 |
| | | N/A |
|
| 3771 |
+-----------------------------------------+------------------------+----------------------+
|
| 3772 |
|
|
|
|
| 3782 |
</div>
|
| 3783 |
</div>
|
| 3784 |
|
| 3785 |
+
<div class="cell" id="cell-benchmark">
|
| 3786 |
<div class="cell-header">
|
| 3787 |
<span class="collapse-indicators">
|
| 3788 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3789 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3790 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3791 |
</span> |
|
| 3792 |
+
Cell: benchmark | 75.46s
|
| 3793 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3794 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3795 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3796 |
</div>
|
| 3797 |
+
<div id="code-benchmark" class="cell-code" data-lines="343">
|
| 3798 |
<div class="highlight-with-lines">
|
| 3799 |
<div class="line-numbers" id="lines-benchmark">
|
| 3800 |
<a class="line-number" data-cell="benchmark" data-line="1" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 1, true);">1</a>
|
|
|
|
| 4140 |
<a class="line-number" data-cell="benchmark" data-line="341" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 341, true);">341</a>
|
| 4141 |
<a class="line-number" data-cell="benchmark" data-line="342" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 342, true);">342</a>
|
| 4142 |
<a class="line-number" data-cell="benchmark" data-line="343" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 343, true);">343</a>
|
|
|
|
|
|
|
| 4143 |
</div>
|
| 4144 |
<div class="code-wrap">
|
| 4145 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
|
|
|
| 4485 |
<span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">inference_mode</span><span class="p">():</span>
|
| 4486 |
<span class="n">correctness</span><span class="p">()</span>
|
| 4487 |
<span class="n">fig</span> <span class="o">=</span> <span class="n">benchmark_fn</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="n">print_data</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">save_path</span><span class="o">=</span><span class="n">output_dir</span><span class="o">.</span><span class="n">as_posix</span><span class="p">())</span>
|
|
|
|
|
|
|
| 4488 |
</pre></div>
|
| 4489 |
|
| 4490 |
<div class="code-line-highlight" id="line-highlight-benchmark"></div>
|
|
|
|
| 4500 |
|
| 4501 |
|
| 4502 |
===== Testing shape: (1, 4224, 24, 128) =====
|
| 4503 |
+
torch_cudnn : absmax=0.001524, mae=0.000075, mse=0.000000
|
| 4504 |
+
torch_cudnn_compile_d : absmax=0.001524, mae=0.000075, mse=0.000000
|
| 4505 |
+
torch_cudnn_compile_ma : absmax=0.001524, mae=0.000075, mse=0.000000
|
| 4506 |
+
torch_flash : absmax=0.001524, mae=0.000075, mse=0.000000
|
| 4507 |
+
torch_flash_compile_d : absmax=0.001524, mae=0.000075, mse=0.000000
|
| 4508 |
+
torch_flash_compile_ma : absmax=0.001524, mae=0.000075, mse=0.000000
|
| 4509 |
+
hf_flash_attn : absmax=0.001524, mae=0.000075, mse=0.000000
|
| 4510 |
+
hf_flash_attn3 : absmax=0.001524, mae=0.000075, mse=0.000000
|
| 4511 |
|
| 4512 |
|
| 4513 |
===== Testing shape: (1, 4352, 24, 128) =====
|
| 4514 |
+
torch_cudnn : absmax=0.001335, mae=0.000074, mse=0.000000
|
| 4515 |
+
torch_cudnn_compile_d : absmax=0.001335, mae=0.000074, mse=0.000000
|
| 4516 |
+
torch_cudnn_compile_ma : absmax=0.001335, mae=0.000074, mse=0.000000
|
| 4517 |
+
torch_flash : absmax=0.001321, mae=0.000074, mse=0.000000
|
| 4518 |
+
torch_flash_compile_d : absmax=0.001321, mae=0.000074, mse=0.000000
|
| 4519 |
+
torch_flash_compile_ma : absmax=0.001321, mae=0.000074, mse=0.000000
|
| 4520 |
+
hf_flash_attn : absmax=0.001321, mae=0.000074, mse=0.000000
|
| 4521 |
+
hf_flash_attn3 : absmax=0.001321, mae=0.000074, mse=0.000000
|
| 4522 |
|
| 4523 |
|
| 4524 |
===== Testing shape: (1, 4416, 24, 128) =====
|
| 4525 |
+
torch_cudnn : absmax=0.000897, mae=0.000073, mse=0.000000
|
| 4526 |
+
torch_cudnn_compile_d : absmax=0.000897, mae=0.000073, mse=0.000000
|
| 4527 |
+
torch_cudnn_compile_ma : absmax=0.000897, mae=0.000073, mse=0.000000
|
| 4528 |
+
torch_flash : absmax=0.000897, mae=0.000073, mse=0.000000
|
| 4529 |
+
torch_flash_compile_d : absmax=0.000897, mae=0.000073, mse=0.000000
|
| 4530 |
+
torch_flash_compile_ma : absmax=0.000897, mae=0.000073, mse=0.000000
|
| 4531 |
+
hf_flash_attn : absmax=0.000897, mae=0.000073, mse=0.000000
|
| 4532 |
+
hf_flash_attn3 : absmax=0.000897, mae=0.000073, mse=0.000000
|
| 4533 |
|
| 4534 |
|
| 4535 |
===== Testing shape: (1, 4480, 24, 128) =====
|
| 4536 |
+
torch_cudnn : absmax=0.001691, mae=0.000073, mse=0.000000
|
| 4537 |
+
torch_cudnn_compile_d : absmax=0.001691, mae=0.000073, mse=0.000000
|
| 4538 |
+
torch_cudnn_compile_ma : absmax=0.001691, mae=0.000073, mse=0.000000
|
| 4539 |
+
torch_flash : absmax=0.001691, mae=0.000073, mse=0.000000
|
| 4540 |
+
torch_flash_compile_d : absmax=0.001691, mae=0.000073, mse=0.000000
|
| 4541 |
+
torch_flash_compile_ma : absmax=0.001691, mae=0.000073, mse=0.000000
|
| 4542 |
+
hf_flash_attn : absmax=0.001691, mae=0.000073, mse=0.000000
|
| 4543 |
+
hf_flash_attn3 : absmax=0.001691, mae=0.000073, mse=0.000000
|
| 4544 |
|
| 4545 |
|
| 4546 |
===== Testing shape: (1, 4544, 24, 128) =====
|
| 4547 |
+
torch_cudnn : absmax=0.001201, mae=0.000072, mse=0.000000
|
| 4548 |
+
torch_cudnn_compile_d : absmax=0.001201, mae=0.000072, mse=0.000000
|
| 4549 |
+
torch_cudnn_compile_ma : absmax=0.001201, mae=0.000072, mse=0.000000
|
| 4550 |
+
torch_flash : absmax=0.001201, mae=0.000072, mse=0.000000
|
| 4551 |
+
torch_flash_compile_d : absmax=0.001201, mae=0.000072, mse=0.000000
|
| 4552 |
+
torch_flash_compile_ma : absmax=0.001201, mae=0.000072, mse=0.000000
|
| 4553 |
+
hf_flash_attn : absmax=0.001201, mae=0.000072, mse=0.000000
|
| 4554 |
+
hf_flash_attn3 : absmax=0.001201, mae=0.000072, mse=0.000000
|
| 4555 |
|
| 4556 |
|
| 4557 |
===== Testing shape: (1, 4608, 24, 128) =====
|
| 4558 |
+
torch_cudnn : absmax=0.001150, mae=0.000071, mse=0.000000
|
| 4559 |
+
torch_cudnn_compile_d : absmax=0.001150, mae=0.000071, mse=0.000000
|
| 4560 |
+
torch_cudnn_compile_ma : absmax=0.001150, mae=0.000071, mse=0.000000
|
| 4561 |
+
torch_flash : absmax=0.001150, mae=0.000071, mse=0.000000
|
| 4562 |
+
torch_flash_compile_d : absmax=0.001150, mae=0.000071, mse=0.000000
|
| 4563 |
+
torch_flash_compile_ma : absmax=0.001150, mae=0.000071, mse=0.000000
|
| 4564 |
+
hf_flash_attn : absmax=0.001150, mae=0.000071, mse=0.000000
|
| 4565 |
+
hf_flash_attn3 : absmax=0.001150, mae=0.000071, mse=0.000000
|
| 4566 |
Attention Benchmark:
|
| 4567 |
seq_len torch_cudnn torch_cudnn_compile_d torch_cudnn_compile_ma torch_flash torch_flash_compile_d torch_flash_compile_ma hf_flash_attn hf_flash_attn3
|
| 4568 |
+
0 4224.0 3.801472 3.790064 4.182320 3.968000 3.957824 4.311152 3.398160 3.330400
|
| 4569 |
+
1 4352.0 4.082944 4.082912 4.413488 4.400000 4.391936 4.738048 3.837424 3.758208
|
| 4570 |
+
2 4416.0 4.142624 4.135648 4.484160 4.452304 4.446096 4.792480 3.892064 3.864128
|
| 4571 |
+
3 4480.0 4.206144 4.198752 4.551808 4.530752 4.522944 4.873760 3.949344 3.870224
|
| 4572 |
+
4 4544.0 4.438320 4.433104 4.787584 4.584160 4.576640 4.934304 4.008960 3.974672
|
| 4573 |
+
5 4608.0 4.502432 4.495456 4.871872 4.660192 4.651040 5.029792 4.065616 3.984160
|
| 4574 |
</div>
|
| 4575 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4576 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4577 |
<div class="uv-logs-content" style="display: none;">
|
| 4578 |
+
Downloading networkx (1.9MiB)
|
| 4579 |
+
Downloading numpy (16.2MiB)
|
| 4580 |
+
Downloading pillow (6.3MiB)
|
| 4581 |
+
Downloading fonttools (4.7MiB)
|
| 4582 |
Downloading setuptools (1.1MiB)
|
| 4583 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4584 |
+
Downloading sympy (6.0MiB)
|
| 4585 |
Downloading pandas (11.8MiB)
|
| 4586 |
+
Downloading hf-xet (3.0MiB)
|
| 4587 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
| 4588 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4589 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
|
|
|
| 4590 |
Downloading matplotlib (8.3MiB)
|
| 4591 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4592 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4593 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4594 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4595 |
Downloading kiwisolver (1.4MiB)
|
| 4596 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4597 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4598 |
+
Downloading torch (846.9MiB)
|
| 4599 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4600 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4601 |
+
Downloading triton (148.3MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4602 |
Downloading nvidia-cufile-cu12
|
| 4603 |
Downloading kiwisolver
|
| 4604 |
Downloading hf-xet
|
|
|
|
| 4612 |
Downloading numpy
|
| 4613 |
Downloading nvidia-nvjitlink-cu12
|
| 4614 |
Downloading nvidia-curand-cu12
|
|
|
|
| 4615 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 4616 |
+
Downloading pandas
|
| 4617 |
Downloading triton
|
| 4618 |
Downloading nvidia-cufft-cu12
|
| 4619 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4620 |
Downloading nvidia-cusparse-cu12
|
| 4621 |
+
Downloading nvidia-cusparselt-cu12
|
| 4622 |
Downloading nvidia-nccl-cu12
|
| 4623 |
Downloading nvidia-cublas-cu12
|
| 4624 |
Downloading nvidia-cudnn-cu12
|
| 4625 |
Downloading torch
|
| 4626 |
+
Installed 49 packages in 518ms
|
| 4627 |
</div>
|
| 4628 |
</div>
|
| 4629 |
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 4630 |
+
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:03, 5.10it/s]
|
| 4631 |
+
Fetching 20 files: 10%|█ | 2/20 [00:01<00:14, 1.23it/s]
|
| 4632 |
+
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 13.86it/s]
|
| 4633 |
|
| 4634 |
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4635 |
+
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 6.31it/s]
|
| 4636 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.34it/s]
|
| 4637 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 3.05it/s]</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4638 |
<div class="cell-artifacts">
|
| 4639 |
<h4>Artifacts:</h4>
|
| 4640 |
<a href="artifacts/benchmark/Attention Benchmark.png" class="artifact" target="_blank">Attention Benchmark.png</a>
|
flash_attn/cells/benchmark.py
CHANGED
|
@@ -341,5 +341,3 @@ def benchmark_fn(seq_len: int, provider: str):
|
|
| 341 |
with torch.inference_mode():
|
| 342 |
correctness()
|
| 343 |
fig = benchmark_fn.run(print_data=True, save_path=output_dir.as_posix())
|
| 344 |
-
fig.savefig(output_dir / "attention_benchmark.png", dpi=300, bbox_inches="tight")
|
| 345 |
-
print(f"Benchmark plot saved to: {output_dir / 'attention_benchmark.png'}")
|
|
|
|
| 341 |
with torch.inference_mode():
|
| 342 |
correctness()
|
| 343 |
fig = benchmark_fn.run(print_data=True, save_path=output_dir.as_posix())
|
|
|
|
|
|
moe_benchmarks/megablocks/megablocks_only.html
CHANGED
|
@@ -3727,7 +3727,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3727 |
<span onclick="toggleOutput('forward_and_backward_no_kernel')" style="cursor: pointer;">▼ output</span>
|
| 3728 |
<span id="uv-indicator-forward_and_backward_no_kernel" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3729 |
</span> |
|
| 3730 |
-
Cell: forward_and_backward_no_kernel | 17.
|
| 3731 |
| <button class="run-btn" onclick="runCell('forward_and_backward_no_kernel')">▶ run</button>
|
| 3732 |
<button class="copy-btn" onclick="copyCell('forward_and_backward_no_kernel')">Copy</button>
|
| 3733 |
<a href="cells/forward_and_backward_no_kernel.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3727 |
<span onclick="toggleOutput('forward_and_backward_no_kernel')" style="cursor: pointer;">▼ output</span>
|
| 3728 |
<span id="uv-indicator-forward_and_backward_no_kernel" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3729 |
</span> |
|
| 3730 |
+
Cell: forward_and_backward_no_kernel | 17.10s | FAILED
|
| 3731 |
| <button class="run-btn" onclick="runCell('forward_and_backward_no_kernel')">▶ run</button>
|
| 3732 |
<button class="copy-btn" onclick="copyCell('forward_and_backward_no_kernel')">Copy</button>
|
| 3733 |
<a href="cells/forward_and_backward_no_kernel.py" target="_blank" class="raw-btn">Raw</a>
|
moe_benchmarks/megablocks_yamoe/artifacts/binned_run/binned_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms":
|
| 13 |
-
"min_ms": 33.
|
| 14 |
-
"max_ms":
|
| 15 |
-
"std_ms": 1.
|
| 16 |
-
"p50_ms":
|
| 17 |
-
"p95_ms":
|
| 18 |
-
"p99_ms":
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 35.79408963999981,
|
| 13 |
+
"min_ms": 33.22658100000808,
|
| 14 |
+
"max_ms": 37.58223699998098,
|
| 15 |
+
"std_ms": 1.260985811405264,
|
| 16 |
+
"p50_ms": 36.03647150001166,
|
| 17 |
+
"p95_ms": 37.377484250018256,
|
| 18 |
+
"p99_ms": 37.52526078000528,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2793.7573215509365,
|
| 21 |
+
"throughput_variance": 99.68321642463675
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms":
|
| 13 |
-
"min_ms": 40.
|
| 14 |
-
"max_ms":
|
| 15 |
-
"std_ms": 2.
|
| 16 |
-
"p50_ms":
|
| 17 |
-
"p95_ms":
|
| 18 |
-
"p99_ms":
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 45.9334355199951,
|
| 13 |
+
"min_ms": 40.05551199998081,
|
| 14 |
+
"max_ms": 49.51232600001276,
|
| 15 |
+
"std_ms": 2.4709340263031536,
|
| 16 |
+
"p50_ms": 46.49940249998963,
|
| 17 |
+
"p95_ms": 49.05830289997937,
|
| 18 |
+
"p99_ms": 49.3528599099983,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2177.0633715492368,
|
| 21 |
+
"throughput_variance": 121.25434497483073
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms":
|
| 13 |
-
"min_ms": 38.
|
| 14 |
-
"max_ms": 51.
|
| 15 |
-
"std_ms": 3.
|
| 16 |
-
"p50_ms":
|
| 17 |
-
"p95_ms": 51.
|
| 18 |
-
"p99_ms": 51.
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance": 188.
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 45.94743435999817,
|
| 13 |
+
"min_ms": 38.690121999991334,
|
| 14 |
+
"max_ms": 51.193351999984316,
|
| 15 |
+
"std_ms": 3.91507100876056,
|
| 16 |
+
"p50_ms": 45.20909099997539,
|
| 17 |
+
"p95_ms": 51.039028550002286,
|
| 18 |
+
"p99_ms": 51.14429515998495,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2176.4000839851024,
|
| 21 |
+
"throughput_variance": 188.75969966024954
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/artifacts/megablocks_run/megablocks_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms": 3.
|
| 13 |
-
"min_ms": 0.
|
| 14 |
-
"max_ms": 8.
|
| 15 |
-
"std_ms": 3.
|
| 16 |
-
"p50_ms": 0.
|
| 17 |
-
"p95_ms": 8.
|
| 18 |
-
"p99_ms": 8.
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 6.4738850593566895
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 3.8478457200017147,
|
| 13 |
+
"min_ms": 0.8121239999354657,
|
| 14 |
+
"max_ms": 8.535666000057063,
|
| 15 |
+
"std_ms": 3.697659288553723,
|
| 16 |
+
"p50_ms": 0.8394504999955643,
|
| 17 |
+
"p95_ms": 8.499624499950187,
|
| 18 |
+
"p99_ms": 8.528520820026415,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 25988.567961595778,
|
| 21 |
+
"throughput_variance": 53035.39729321811
|
| 22 |
},
|
| 23 |
"output_sum": 6.4738850593566895
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/artifacts/visualization/moe_performance_comparison.png
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
moe_benchmarks/megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms": 4.
|
| 13 |
-
"min_ms": 4.
|
| 14 |
-
"max_ms": 4.
|
| 15 |
-
"std_ms": 0.
|
| 16 |
-
"p50_ms": 4.
|
| 17 |
-
"p95_ms": 4.
|
| 18 |
-
"p99_ms": 4.
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
-
"output_sum": 3.
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 4.246394759998111,
|
| 13 |
+
"min_ms": 4.066528999999264,
|
| 14 |
+
"max_ms": 4.294285000014497,
|
| 15 |
+
"std_ms": 0.033808054217192726,
|
| 16 |
+
"p50_ms": 4.2530110000313925,
|
| 17 |
+
"p95_ms": 4.267295049984909,
|
| 18 |
+
"p99_ms": 4.287134920007816,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 23549.38851705923,
|
| 21 |
+
"throughput_variance": 193.18069406896424
|
| 22 |
},
|
| 23 |
+
"output_sum": 3.97190523147583
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc
CHANGED
|
Binary files a/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc and b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc differ
|
|
|
moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc
CHANGED
|
Binary files a/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc and b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc differ
|
|
|
moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html
CHANGED
|
@@ -3726,7 +3726,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3726 |
<span onclick="toggleOutput('setup')" style="cursor: pointer;">▼ output</span>
|
| 3727 |
<span id="uv-indicator-setup" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3728 |
</span> |
|
| 3729 |
-
Cell: setup |
|
| 3730 |
| <button class="run-btn" onclick="runCell('setup')">▶ run</button>
|
| 3731 |
<button class="copy-btn" onclick="copyCell('setup')">Copy</button>
|
| 3732 |
<a href="cells/setup.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3726 |
<span onclick="toggleOutput('setup')" style="cursor: pointer;">▼ output</span>
|
| 3727 |
<span id="uv-indicator-setup" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3728 |
</span> |
|
| 3729 |
+
Cell: setup | 16.96s | FAILED
|
| 3730 |
| <button class="run-btn" onclick="runCell('setup')">▶ run</button>
|
| 3731 |
<button class="copy-btn" onclick="copyCell('setup')">Copy</button>
|
| 3732 |
<a href="cells/setup.py" target="_blank" class="raw-btn">Raw</a>
|
moe_benchmarks/megablocks_yamoe/torch_profile.html
CHANGED
|
@@ -3720,7 +3720,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3720 |
<span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
|
| 3721 |
<span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3722 |
</span> |
|
| 3723 |
-
Cell: utils | deps: torch, numpy | 34.
|
| 3724 |
| <button class="run-btn" onclick="runCell('utils')">▶ run</button>
|
| 3725 |
<button class="copy-btn" onclick="copyCell('utils')">Copy</button>
|
| 3726 |
<a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3794,24 +3794,24 @@ Cell: utils | deps: torch, numpy | 34.86s
|
|
| 3794 |
<div class="uv-install-logs" id="uv-logs-utils">
|
| 3795 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3796 |
<div class="uv-logs-content" style="display: none;">
|
| 3797 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3798 |
-
Downloading sympy (6.0MiB)
|
| 3799 |
Downloading networkx (1.9MiB)
|
| 3800 |
-
Downloading setuptools (1.1MiB)
|
| 3801 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3802 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3803 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3804 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3805 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3806 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3807 |
-
Downloading nvidia-
|
| 3808 |
-
Downloading
|
| 3809 |
-
Downloading triton (148.3MiB)
|
| 3810 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3811 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
| 3812 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3813 |
-
Downloading
|
| 3814 |
-
Downloading nvidia-
|
|
|
|
|
|
|
|
|
|
| 3815 |
Downloading nvidia-cufile-cu12
|
| 3816 |
Downloading setuptools
|
| 3817 |
Downloading networkx
|
|
@@ -3843,7 +3843,7 @@ Installed 26 packages in 447ms
|
|
| 3843 |
<span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: bench_utils | deps: torch, numpy |
|
| 3847 |
| <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
|
| 3849 |
<a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4331,24 +4331,24 @@ Cell: bench_utils | deps: torch, numpy | 35.19s
|
|
| 4331 |
<div class="uv-install-logs" id="uv-logs-bench_utils">
|
| 4332 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4333 |
<div class="uv-logs-content" style="display: none;">
|
| 4334 |
-
Downloading sympy (6.0MiB)
|
| 4335 |
-
Downloading networkx (1.9MiB)
|
| 4336 |
Downloading setuptools (1.1MiB)
|
| 4337 |
-
Downloading
|
| 4338 |
-
Downloading nvidia-
|
|
|
|
| 4339 |
Downloading numpy (16.2MiB)
|
| 4340 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4341 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4342 |
-
Downloading torch (846.9MiB)
|
| 4343 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4344 |
-
Downloading
|
|
|
|
| 4345 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4346 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4347 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4348 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4349 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4350 |
-
Downloading
|
| 4351 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4352 |
Downloading nvidia-cufile-cu12
|
| 4353 |
Downloading setuptools
|
| 4354 |
Downloading networkx
|
|
@@ -4361,8 +4361,8 @@ Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
| 4361 |
Downloading triton
|
| 4362 |
Downloading nvidia-cufft-cu12
|
| 4363 |
Downloading nvidia-cusolver-cu12
|
| 4364 |
-
Downloading nvidia-cusparselt-cu12
|
| 4365 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 4366 |
Downloading nvidia-nccl-cu12
|
| 4367 |
Downloading nvidia-cublas-cu12
|
| 4368 |
Downloading nvidia-cudnn-cu12
|
|
@@ -4381,7 +4381,7 @@ Installed 26 packages in 455ms
|
|
| 4381 |
<span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
|
| 4382 |
<span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4383 |
</span> |
|
| 4384 |
-
Cell: config | deps: torch, numpy |
|
| 4385 |
| <button class="run-btn" onclick="runCell('config')">▶ run</button>
|
| 4386 |
<button class="copy-btn" onclick="copyCell('config')">Copy</button>
|
| 4387 |
<a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4441,24 +4441,24 @@ Cell: config | deps: torch, numpy | 34.96s
|
|
| 4441 |
<div class="uv-install-logs" id="uv-logs-config">
|
| 4442 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4443 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
| 4444 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4445 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
|
|
|
| 4446 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4447 |
-
Downloading sympy (6.0MiB)
|
| 4448 |
-
Downloading triton (148.3MiB)
|
| 4449 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
| 4450 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4451 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4452 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4453 |
-
Downloading setuptools (1.1MiB)
|
| 4454 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4455 |
-
Downloading networkx (1.9MiB)
|
| 4456 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4457 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4458 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4459 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4460 |
Downloading torch (846.9MiB)
|
| 4461 |
-
Downloading
|
| 4462 |
Downloading nvidia-cufile-cu12
|
| 4463 |
Downloading setuptools
|
| 4464 |
Downloading networkx
|
|
@@ -4471,8 +4471,8 @@ Downloading numpy (16.2MiB)
|
|
| 4471 |
Downloading triton
|
| 4472 |
Downloading nvidia-cufft-cu12
|
| 4473 |
Downloading nvidia-cusolver-cu12
|
| 4474 |
-
Downloading nvidia-cusparse-cu12
|
| 4475 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 4476 |
Downloading nvidia-nccl-cu12
|
| 4477 |
Downloading nvidia-cublas-cu12
|
| 4478 |
Downloading nvidia-cudnn-cu12
|
|
@@ -4490,7 +4490,7 @@ Installed 26 packages in 449ms
|
|
| 4490 |
<span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
|
| 4491 |
<span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4492 |
</span> |
|
| 4493 |
-
Cell: save_data | deps: torch, numpy | 39.
|
| 4494 |
| <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
|
| 4495 |
<button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
|
| 4496 |
<a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4585,24 +4585,24 @@ Down sum: 206.729263
|
|
| 4585 |
<div class="uv-install-logs" id="uv-logs-save_data">
|
| 4586 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4587 |
<div class="uv-logs-content" style="display: none;">
|
| 4588 |
-
Downloading
|
| 4589 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4590 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4591 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4592 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4593 |
Downloading sympy (6.0MiB)
|
|
|
|
|
|
|
|
|
|
| 4594 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
| 4595 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4596 |
-
Downloading numpy (16.2MiB)
|
| 4597 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4598 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4599 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4600 |
-
Downloading
|
| 4601 |
-
Downloading nvidia-
|
|
|
|
| 4602 |
Downloading networkx (1.9MiB)
|
| 4603 |
-
Downloading torch (846.9MiB)
|
| 4604 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
| 4605 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
| 4606 |
Downloading nvidia-cufile-cu12
|
| 4607 |
Downloading setuptools
|
| 4608 |
Downloading networkx
|
|
@@ -4621,17 +4621,17 @@ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
| 4621 |
Downloading nvidia-cublas-cu12
|
| 4622 |
Downloading nvidia-cudnn-cu12
|
| 4623 |
Downloading torch
|
| 4624 |
-
Installed 26 packages in
|
| 4625 |
</div>
|
| 4626 |
</div>
|
| 4627 |
<div class="cell-artifacts">
|
| 4628 |
<h4>Artifacts:</h4>
|
| 4629 |
<a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
|
| 4630 |
<a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
|
| 4631 |
-
<a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
|
| 4632 |
-
<a href="artifacts/save_data/down_proj.pt" class="artifact" target="_blank">down_proj.pt</a>
|
| 4633 |
<a href="artifacts/save_data/router_bias.pt" class="artifact" target="_blank">router_bias.pt</a>
|
|
|
|
| 4634 |
<a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
|
|
|
|
| 4635 |
</div>
|
| 4636 |
</div>
|
| 4637 |
</div>
|
|
@@ -4645,7 +4645,7 @@ Installed 26 packages in 453ms
|
|
| 4645 |
<span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
|
| 4646 |
<span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4647 |
</span> |
|
| 4648 |
-
Cell: yamoe_run | deps: torch, kernels, numpy |
|
| 4649 |
| <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
|
| 4650 |
<button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
|
| 4651 |
<a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4938,10 +4938,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 4938 |
|
| 4939 |
Warming up (10 iterations)...
|
| 4940 |
Benchmarking (50 iterations)...
|
| 4941 |
-
Progress: 20% complete (avg: 4.
|
| 4942 |
-
Progress: 40% complete (avg: 4.
|
| 4943 |
-
Progress: 60% complete (avg: 4.
|
| 4944 |
-
Progress: 80% complete (avg: 4.
|
| 4945 |
|
| 4946 |
Output tensors:
|
| 4947 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
@@ -4951,47 +4951,47 @@ Output tensors:
|
|
| 4951 |
Iterations: 50
|
| 4952 |
|
| 4953 |
Latency Statistics:
|
| 4954 |
-
Average: 4.
|
| 4955 |
-
Min: 4.
|
| 4956 |
-
Max: 4.
|
| 4957 |
-
Std Dev: 0.
|
| 4958 |
|
| 4959 |
Percentiles:
|
| 4960 |
P50 (median): 4.253 ms
|
| 4961 |
-
P95: 4.
|
| 4962 |
-
P99: 4.
|
| 4963 |
|
| 4964 |
Throughput:
|
| 4965 |
-
Tokens/sec:
|
| 4966 |
-
Std Dev:
|
| 4967 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 4968 |
|
| 4969 |
Saved benchmark results to yamoe_results.json
|
| 4970 |
|
| 4971 |
-
Output sum: 3.
|
| 4972 |
</div>
|
| 4973 |
<div class="uv-install-logs" id="uv-logs-yamoe_run">
|
| 4974 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4975 |
<div class="uv-logs-content" style="display: none;">
|
| 4976 |
-
Downloading
|
| 4977 |
-
Downloading
|
| 4978 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4979 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4980 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4981 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4982 |
Downloading torch (846.9MiB)
|
| 4983 |
-
Downloading nvidia-
|
| 4984 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4985 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4986 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4987 |
-
Downloading
|
| 4988 |
-
Downloading
|
| 4989 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
| 4990 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
| 4991 |
Downloading sympy (6.0MiB)
|
| 4992 |
-
Downloading
|
| 4993 |
Downloading hf-xet (3.0MiB)
|
| 4994 |
-
Downloading nvidia-
|
|
|
|
| 4995 |
Downloading nvidia-cufile-cu12
|
| 4996 |
Downloading hf-xet
|
| 4997 |
Downloading setuptools
|
|
@@ -5005,19 +5005,19 @@ Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
| 5005 |
Downloading triton
|
| 5006 |
Downloading nvidia-cufft-cu12
|
| 5007 |
Downloading nvidia-cusolver-cu12
|
| 5008 |
-
Downloading nvidia-cusparselt-cu12
|
| 5009 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 5010 |
Downloading nvidia-nccl-cu12
|
| 5011 |
Downloading nvidia-cublas-cu12
|
| 5012 |
Downloading nvidia-cudnn-cu12
|
| 5013 |
Downloading torch
|
| 5014 |
-
Installed 37 packages in
|
| 5015 |
</div>
|
| 5016 |
</div>
|
| 5017 |
<div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 5018 |
-
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:01, 3.
|
| 5019 |
-
Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 3.
|
| 5020 |
-
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 6.
|
| 5021 |
<div class="cell-artifacts">
|
| 5022 |
<h4>Artifacts:</h4>
|
| 5023 |
<a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
|
|
@@ -5034,7 +5034,7 @@ Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 6.1
|
|
| 5034 |
<span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
|
| 5035 |
<span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5036 |
</span> |
|
| 5037 |
-
Cell: binned_run | deps: torch, numpy | 39.
|
| 5038 |
| <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
|
| 5039 |
<button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
|
| 5040 |
<a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -5448,10 +5448,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 5448 |
|
| 5449 |
Warming up (10 iterations)...
|
| 5450 |
Benchmarking (50 iterations)...
|
| 5451 |
-
Progress: 20% complete (avg:
|
| 5452 |
-
Progress: 40% complete (avg:
|
| 5453 |
-
Progress: 60% complete (avg:
|
| 5454 |
-
Progress: 80% complete (avg:
|
| 5455 |
|
| 5456 |
Output tensors:
|
| 5457 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
@@ -5461,19 +5461,19 @@ Output tensors:
|
|
| 5461 |
Iterations: 50
|
| 5462 |
|
| 5463 |
Latency Statistics:
|
| 5464 |
-
Average:
|
| 5465 |
-
Min: 33.
|
| 5466 |
-
Max:
|
| 5467 |
-
Std Dev: 1.
|
| 5468 |
|
| 5469 |
Percentiles:
|
| 5470 |
-
P50 (median):
|
| 5471 |
-
P95:
|
| 5472 |
-
P99:
|
| 5473 |
|
| 5474 |
Throughput:
|
| 5475 |
-
Tokens/sec:
|
| 5476 |
-
Std Dev:
|
| 5477 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5478 |
|
| 5479 |
Saved benchmark results to binned_results.json
|
|
@@ -5483,23 +5483,23 @@ Output sum: 3.971905
|
|
| 5483 |
<div class="uv-install-logs" id="uv-logs-binned_run">
|
| 5484 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5485 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
| 5486 |
Downloading setuptools (1.1MiB)
|
|
|
|
| 5487 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5488 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5489 |
-
Downloading networkx (1.9MiB)
|
| 5490 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5491 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5492 |
Downloading triton (148.3MiB)
|
|
|
|
| 5493 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5494 |
-
Downloading nvidia-
|
| 5495 |
Downloading torch (846.9MiB)
|
|
|
|
|
|
|
| 5496 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5497 |
-
Downloading
|
| 5498 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5499 |
-
Downloading sympy (6.0MiB)
|
| 5500 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5501 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5502 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5503 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5504 |
Downloading nvidia-cufile-cu12
|
| 5505 |
Downloading setuptools
|
|
@@ -5513,13 +5513,13 @@ Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
| 5513 |
Downloading triton
|
| 5514 |
Downloading nvidia-cufft-cu12
|
| 5515 |
Downloading nvidia-cusolver-cu12
|
| 5516 |
-
Downloading nvidia-cusparse-cu12
|
| 5517 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 5518 |
Downloading nvidia-nccl-cu12
|
| 5519 |
Downloading nvidia-cublas-cu12
|
| 5520 |
Downloading nvidia-cudnn-cu12
|
| 5521 |
Downloading torch
|
| 5522 |
-
Installed 26 packages in
|
| 5523 |
</div>
|
| 5524 |
</div>
|
| 5525 |
<div class="cell-artifacts">
|
|
@@ -5538,7 +5538,7 @@ Installed 26 packages in 526ms
|
|
| 5538 |
<span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
|
| 5539 |
<span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5540 |
</span> |
|
| 5541 |
-
Cell: gptoss_run | deps: torch, numpy | 40.
|
| 5542 |
| <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
|
| 5543 |
<button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
|
| 5544 |
<a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -5856,10 +5856,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 5856 |
|
| 5857 |
Warming up (10 iterations)...
|
| 5858 |
Benchmarking (50 iterations)...
|
| 5859 |
-
Progress: 20% complete (avg:
|
| 5860 |
-
Progress: 40% complete (avg:
|
| 5861 |
-
Progress: 60% complete (avg:
|
| 5862 |
-
Progress: 80% complete (avg:
|
| 5863 |
|
| 5864 |
Output tensors:
|
| 5865 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
@@ -5869,19 +5869,19 @@ Output tensors:
|
|
| 5869 |
Iterations: 50
|
| 5870 |
|
| 5871 |
Latency Statistics:
|
| 5872 |
-
Average:
|
| 5873 |
-
Min: 40.
|
| 5874 |
-
Max:
|
| 5875 |
-
Std Dev: 2.
|
| 5876 |
|
| 5877 |
Percentiles:
|
| 5878 |
-
P50 (median):
|
| 5879 |
-
P95:
|
| 5880 |
-
P99:
|
| 5881 |
|
| 5882 |
Throughput:
|
| 5883 |
-
Tokens/sec:
|
| 5884 |
-
Std Dev:
|
| 5885 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━��━━━━━━━━━━━━━
|
| 5886 |
|
| 5887 |
Saved benchmark results to gptoss_results.json
|
|
@@ -5892,23 +5892,23 @@ Output sum: 11.532237
|
|
| 5892 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5893 |
<div class="uv-logs-content" style="display: none;">
|
| 5894 |
Downloading setuptools (1.1MiB)
|
|
|
|
|
|
|
| 5895 |
Downloading networkx (1.9MiB)
|
| 5896 |
-
Downloading
|
| 5897 |
-
Downloading nvidia-
|
| 5898 |
-
Downloading nvidia-
|
| 5899 |
-
Downloading nvidia-
|
|
|
|
| 5900 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5901 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5902 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5903 |
-
Downloading numpy (16.2MiB)
|
| 5904 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5905 |
-
Downloading nvidia-
|
| 5906 |
-
Downloading nvidia-
|
| 5907 |
-
Downloading sympy (6.0MiB)
|
| 5908 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5909 |
Downloading torch (846.9MiB)
|
| 5910 |
-
Downloading nvidia-
|
| 5911 |
-
Downloading nvidia-
|
|
|
|
| 5912 |
Downloading nvidia-cufile-cu12
|
| 5913 |
Downloading setuptools
|
| 5914 |
Downloading networkx
|
|
@@ -5927,7 +5927,7 @@ Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
| 5927 |
Downloading nvidia-cublas-cu12
|
| 5928 |
Downloading nvidia-cudnn-cu12
|
| 5929 |
Downloading torch
|
| 5930 |
-
Installed 26 packages in
|
| 5931 |
</div>
|
| 5932 |
</div>
|
| 5933 |
<div class="cell-artifacts">
|
|
@@ -5946,7 +5946,7 @@ Installed 26 packages in 449ms
|
|
| 5946 |
<span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
|
| 5947 |
<span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5948 |
</span> |
|
| 5949 |
-
Cell: gptoss_training_run | deps: torch, numpy |
|
| 5950 |
| <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
|
| 5951 |
<button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
|
| 5952 |
<a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -6247,10 +6247,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 6247 |
|
| 6248 |
Warming up (10 iterations)...
|
| 6249 |
Benchmarking (50 iterations)...
|
| 6250 |
-
Progress: 20% complete (avg:
|
| 6251 |
-
Progress: 40% complete (avg: 50.
|
| 6252 |
-
Progress: 60% complete (avg:
|
| 6253 |
-
Progress: 80% complete (avg:
|
| 6254 |
|
| 6255 |
Output tensors:
|
| 6256 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
@@ -6260,18 +6260,18 @@ Output tensors:
|
|
| 6260 |
Iterations: 50
|
| 6261 |
|
| 6262 |
Latency Statistics:
|
| 6263 |
-
Average:
|
| 6264 |
-
Min: 38.
|
| 6265 |
-
Max: 51.
|
| 6266 |
-
Std Dev: 3.
|
| 6267 |
|
| 6268 |
Percentiles:
|
| 6269 |
-
P50 (median):
|
| 6270 |
-
P95: 51.
|
| 6271 |
-
P99: 51.
|
| 6272 |
|
| 6273 |
Throughput:
|
| 6274 |
-
Tokens/sec:
|
| 6275 |
Std Dev: 188.8
|
| 6276 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6277 |
|
|
@@ -6282,23 +6282,23 @@ Output sum: 11.532237
|
|
| 6282 |
<div class="uv-install-logs" id="uv-logs-gptoss_training_run">
|
| 6283 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6284 |
<div class="uv-logs-content" style="display: none;">
|
| 6285 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6286 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6287 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6288 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 6289 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6290 |
-
Downloading nvidia-
|
| 6291 |
-
Downloading networkx (1.9MiB)
|
| 6292 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 6293 |
Downloading sympy (6.0MiB)
|
| 6294 |
-
Downloading numpy (16.2MiB)
|
| 6295 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6296 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6297 |
Downloading torch (846.9MiB)
|
|
|
|
| 6298 |
Downloading setuptools (1.1MiB)
|
| 6299 |
-
Downloading nvidia-
|
| 6300 |
-
Downloading
|
|
|
|
|
|
|
| 6301 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6302 |
Downloading triton (148.3MiB)
|
| 6303 |
Downloading nvidia-cufile-cu12
|
| 6304 |
Downloading setuptools
|
|
@@ -6318,7 +6318,7 @@ Downloading triton (148.3MiB)
|
|
| 6318 |
Downloading nvidia-cublas-cu12
|
| 6319 |
Downloading nvidia-cudnn-cu12
|
| 6320 |
Downloading torch
|
| 6321 |
-
Installed 26 packages in
|
| 6322 |
</div>
|
| 6323 |
</div>
|
| 6324 |
<div class="cell-artifacts">
|
|
@@ -6337,7 +6337,7 @@ Installed 26 packages in 548ms
|
|
| 6337 |
<span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
|
| 6338 |
<span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6339 |
</span> |
|
| 6340 |
-
Cell: megablocks_run | deps: torch, numpy, kernels |
|
| 6341 |
| <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
|
| 6342 |
<button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
|
| 6343 |
<a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -6566,10 +6566,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 6566 |
|
| 6567 |
Warming up (10 iterations)...
|
| 6568 |
Benchmarking (50 iterations)...
|
| 6569 |
-
Progress: 20% complete (avg: 0.
|
| 6570 |
-
Progress: 40% complete (avg: 0.
|
| 6571 |
-
Progress: 60% complete (avg: 0.
|
| 6572 |
-
Progress: 80% complete (avg: 2.
|
| 6573 |
|
| 6574 |
Output tensors:
|
| 6575 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.061104, 0.055115], mean=0.000056, std=0.013535, norm=4.593927
|
|
@@ -6579,19 +6579,19 @@ Output tensors:
|
|
| 6579 |
Iterations: 50
|
| 6580 |
|
| 6581 |
Latency Statistics:
|
| 6582 |
-
Average: 3.
|
| 6583 |
-
Min: 0.
|
| 6584 |
-
Max: 8.
|
| 6585 |
-
Std Dev: 3.
|
| 6586 |
|
| 6587 |
Percentiles:
|
| 6588 |
-
P50 (median): 0.
|
| 6589 |
-
P95: 8.
|
| 6590 |
-
P99: 8.
|
| 6591 |
|
| 6592 |
Throughput:
|
| 6593 |
-
Tokens/sec:
|
| 6594 |
-
Std Dev:
|
| 6595 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6596 |
|
| 6597 |
Saved benchmark results to megablocks_results.json
|
|
@@ -6602,24 +6602,24 @@ Output sum: 6.473885
|
|
| 6602 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6603 |
<div class="uv-logs-content" style="display: none;">
|
| 6604 |
Downloading sympy (6.0MiB)
|
|
|
|
| 6605 |
Downloading numpy (16.2MiB)
|
| 6606 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 6607 |
Downloading hf-xet (3.0MiB)
|
| 6608 |
-
Downloading setuptools (1.1MiB)
|
| 6609 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6610 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6611 |
-
Downloading torch (846.9MiB)
|
| 6612 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6613 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6614 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6615 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
| 6616 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6617 |
-
Downloading nvidia-
|
| 6618 |
-
Downloading nvidia-
|
| 6619 |
-
Downloading triton (148.3MiB)
|
| 6620 |
Downloading networkx (1.9MiB)
|
| 6621 |
-
Downloading
|
|
|
|
|
|
|
| 6622 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6623 |
Downloading nvidia-cufile-cu12
|
| 6624 |
Downloading hf-xet
|
| 6625 |
Downloading setuptools
|
|
@@ -6639,19 +6639,20 @@ Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
| 6639 |
Downloading nvidia-cublas-cu12
|
| 6640 |
Downloading nvidia-cudnn-cu12
|
| 6641 |
Downloading torch
|
| 6642 |
-
Installed 37 packages in
|
| 6643 |
</div>
|
| 6644 |
</div>
|
| 6645 |
<div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 6646 |
-
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:
|
| 6647 |
-
Fetching 66 files:
|
| 6648 |
-
Fetching 66 files:
|
| 6649 |
-
Fetching 66 files:
|
| 6650 |
-
Fetching 66 files:
|
| 6651 |
-
Fetching 66 files:
|
| 6652 |
-
Fetching 66 files:
|
| 6653 |
-
Fetching 66 files:
|
| 6654 |
-
Fetching 66 files:
|
|
|
|
| 6655 |
<div class="cell-artifacts">
|
| 6656 |
<h4>Artifacts:</h4>
|
| 6657 |
<a href="artifacts/megablocks_run/megablocks_results.json" class="artifact" target="_blank">megablocks_results.json</a>
|
|
@@ -6668,7 +6669,7 @@ Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 3
|
|
| 6668 |
<span onclick="toggleOutput('visualization')" style="cursor: pointer;">▼ output</span>
|
| 6669 |
<span id="uv-indicator-visualization" onclick="toggleUvLogsFromHeader('visualization')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6670 |
</span> |
|
| 6671 |
-
Cell: visualization | deps: matplotlib | 3.
|
| 6672 |
| <button class="run-btn" onclick="runCell('visualization')">▶ run</button>
|
| 6673 |
<button class="copy-btn" onclick="copyCell('visualization')">Copy</button>
|
| 6674 |
<a href="cells/visualization.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -6914,30 +6915,30 @@ Loaded /repo/moe_benchmarks/megablocks_yamoe/.uvnote/cache/0febdf3420999533bc2e1
|
|
| 6914 |
Performance Summary:
|
| 6915 |
Implementation Avg (ms) P95 (ms) Tokens/sec Relative Speed
|
| 6916 |
--------------------------------------------------------------------------------
|
| 6917 |
-
megablocks_results 3.
|
| 6918 |
-
yamoe_results 4.25 4.27
|
| 6919 |
-
binned_results
|
| 6920 |
-
|
| 6921 |
-
|
| 6922 |
-
|
| 6923 |
-
Fastest: megablocks_results (3.
|
| 6924 |
-
Slowest:
|
| 6925 |
-
Max Speedup:
|
| 6926 |
</div>
|
| 6927 |
<div class="uv-install-logs" id="uv-logs-visualization">
|
| 6928 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6929 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
| 6930 |
Downloading numpy (16.2MiB)
|
|
|
|
| 6931 |
Downloading kiwisolver (1.4MiB)
|
| 6932 |
Downloading pillow (6.3MiB)
|
| 6933 |
-
Downloading fonttools (4.7MiB)
|
| 6934 |
-
Downloading matplotlib (8.3MiB)
|
| 6935 |
Downloading kiwisolver
|
| 6936 |
Downloading pillow
|
| 6937 |
Downloading fonttools
|
| 6938 |
Downloading matplotlib
|
| 6939 |
Downloading numpy
|
| 6940 |
-
Installed 11 packages in
|
| 6941 |
</div>
|
| 6942 |
</div>
|
| 6943 |
<div class="cell-artifacts">
|
|
|
|
| 3720 |
<span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
|
| 3721 |
<span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3722 |
</span> |
|
| 3723 |
+
Cell: utils | deps: torch, numpy | 34.47s
|
| 3724 |
| <button class="run-btn" onclick="runCell('utils')">▶ run</button>
|
| 3725 |
<button class="copy-btn" onclick="copyCell('utils')">Copy</button>
|
| 3726 |
<a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3794 |
<div class="uv-install-logs" id="uv-logs-utils">
|
| 3795 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3796 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
| 3797 |
Downloading networkx (1.9MiB)
|
|
|
|
|
|
|
|
|
|
| 3798 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3799 |
+
Downloading setuptools (1.1MiB)
|
| 3800 |
+
Downloading numpy (16.2MiB)
|
| 3801 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3802 |
+
Downloading sympy (6.0MiB)
|
| 3803 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
| 3804 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3805 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3806 |
+
Downloading torch (846.9MiB)
|
|
|
|
|
|
|
| 3807 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3808 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3809 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3810 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3811 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3812 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3813 |
+
Downloading triton (148.3MiB)
|
| 3814 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3815 |
Downloading nvidia-cufile-cu12
|
| 3816 |
Downloading setuptools
|
| 3817 |
Downloading networkx
|
|
|
|
| 3843 |
<span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
+
Cell: bench_utils | deps: torch, numpy | 34.94s
|
| 3847 |
| <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
|
| 3849 |
<a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4331 |
<div class="uv-install-logs" id="uv-logs-bench_utils">
|
| 4332 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4333 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
| 4334 |
Downloading setuptools (1.1MiB)
|
| 4335 |
+
Downloading sympy (6.0MiB)
|
| 4336 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4337 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4338 |
Downloading numpy (16.2MiB)
|
|
|
|
|
|
|
|
|
|
| 4339 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4340 |
+
Downloading networkx (1.9MiB)
|
| 4341 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4342 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
| 4343 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4344 |
+
Downloading torch (846.9MiB)
|
| 4345 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4346 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4347 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4348 |
+
Downloading triton (148.3MiB)
|
| 4349 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4350 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4351 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
| 4352 |
Downloading nvidia-cufile-cu12
|
| 4353 |
Downloading setuptools
|
| 4354 |
Downloading networkx
|
|
|
|
| 4361 |
Downloading triton
|
| 4362 |
Downloading nvidia-cufft-cu12
|
| 4363 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4364 |
Downloading nvidia-cusparse-cu12
|
| 4365 |
+
Downloading nvidia-cusparselt-cu12
|
| 4366 |
Downloading nvidia-nccl-cu12
|
| 4367 |
Downloading nvidia-cublas-cu12
|
| 4368 |
Downloading nvidia-cudnn-cu12
|
|
|
|
| 4381 |
<span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
|
| 4382 |
<span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4383 |
</span> |
|
| 4384 |
+
Cell: config | deps: torch, numpy | 35.62s
|
| 4385 |
| <button class="run-btn" onclick="runCell('config')">▶ run</button>
|
| 4386 |
<button class="copy-btn" onclick="copyCell('config')">Copy</button>
|
| 4387 |
<a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4441 |
<div class="uv-install-logs" id="uv-logs-config">
|
| 4442 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4443 |
<div class="uv-logs-content" style="display: none;">
|
| 4444 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4445 |
+
Downloading sympy (6.0MiB)
|
| 4446 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4447 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4448 |
+
Downloading setuptools (1.1MiB)
|
| 4449 |
+
Downloading networkx (1.9MiB)
|
| 4450 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4451 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4452 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4453 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4454 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
| 4455 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4456 |
+
Downloading numpy (16.2MiB)
|
| 4457 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4458 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4459 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
|
|
|
|
|
|
| 4460 |
Downloading torch (846.9MiB)
|
| 4461 |
+
Downloading triton (148.3MiB)
|
| 4462 |
Downloading nvidia-cufile-cu12
|
| 4463 |
Downloading setuptools
|
| 4464 |
Downloading networkx
|
|
|
|
| 4471 |
Downloading triton
|
| 4472 |
Downloading nvidia-cufft-cu12
|
| 4473 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4474 |
Downloading nvidia-cusparselt-cu12
|
| 4475 |
+
Downloading nvidia-cusparse-cu12
|
| 4476 |
Downloading nvidia-nccl-cu12
|
| 4477 |
Downloading nvidia-cublas-cu12
|
| 4478 |
Downloading nvidia-cudnn-cu12
|
|
|
|
| 4490 |
<span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
|
| 4491 |
<span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4492 |
</span> |
|
| 4493 |
+
Cell: save_data | deps: torch, numpy | 39.76s
|
| 4494 |
| <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
|
| 4495 |
<button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
|
| 4496 |
<a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4585 |
<div class="uv-install-logs" id="uv-logs-save_data">
|
| 4586 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4587 |
<div class="uv-logs-content" style="display: none;">
|
| 4588 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
|
|
|
| 4589 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
| 4590 |
Downloading sympy (6.0MiB)
|
| 4591 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4592 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4593 |
+
Downloading numpy (16.2MiB)
|
| 4594 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4595 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4596 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
|
|
|
| 4597 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4598 |
+
Downloading setuptools (1.1MiB)
|
| 4599 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4600 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4601 |
Downloading networkx (1.9MiB)
|
|
|
|
| 4602 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4603 |
+
Downloading triton (148.3MiB)
|
| 4604 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4605 |
+
Downloading torch (846.9MiB)
|
| 4606 |
Downloading nvidia-cufile-cu12
|
| 4607 |
Downloading setuptools
|
| 4608 |
Downloading networkx
|
|
|
|
| 4621 |
Downloading nvidia-cublas-cu12
|
| 4622 |
Downloading nvidia-cudnn-cu12
|
| 4623 |
Downloading torch
|
| 4624 |
+
Installed 26 packages in 447ms
|
| 4625 |
</div>
|
| 4626 |
</div>
|
| 4627 |
<div class="cell-artifacts">
|
| 4628 |
<h4>Artifacts:</h4>
|
| 4629 |
<a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
|
| 4630 |
<a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
|
|
|
|
|
|
|
| 4631 |
<a href="artifacts/save_data/router_bias.pt" class="artifact" target="_blank">router_bias.pt</a>
|
| 4632 |
+
<a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
|
| 4633 |
<a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
|
| 4634 |
+
<a href="artifacts/save_data/down_proj.pt" class="artifact" target="_blank">down_proj.pt</a>
|
| 4635 |
</div>
|
| 4636 |
</div>
|
| 4637 |
</div>
|
|
|
|
| 4645 |
<span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
|
| 4646 |
<span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4647 |
</span> |
|
| 4648 |
+
Cell: yamoe_run | deps: torch, kernels, numpy | 38.79s
|
| 4649 |
| <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
|
| 4650 |
<button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
|
| 4651 |
<a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4938 |
|
| 4939 |
Warming up (10 iterations)...
|
| 4940 |
Benchmarking (50 iterations)...
|
| 4941 |
+
Progress: 20% complete (avg: 4.251 ms)
|
| 4942 |
+
Progress: 40% complete (avg: 4.249 ms)
|
| 4943 |
+
Progress: 60% complete (avg: 4.244 ms)
|
| 4944 |
+
Progress: 80% complete (avg: 4.246 ms)
|
| 4945 |
|
| 4946 |
Output tensors:
|
| 4947 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
|
|
| 4951 |
Iterations: 50
|
| 4952 |
|
| 4953 |
Latency Statistics:
|
| 4954 |
+
Average: 4.246 ms
|
| 4955 |
+
Min: 4.067 ms
|
| 4956 |
+
Max: 4.294 ms
|
| 4957 |
+
Std Dev: 0.034 ms
|
| 4958 |
|
| 4959 |
Percentiles:
|
| 4960 |
P50 (median): 4.253 ms
|
| 4961 |
+
P95: 4.267 ms
|
| 4962 |
+
P99: 4.287 ms
|
| 4963 |
|
| 4964 |
Throughput:
|
| 4965 |
+
Tokens/sec: 23549.4
|
| 4966 |
+
Std Dev: 193.2
|
| 4967 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 4968 |
|
| 4969 |
Saved benchmark results to yamoe_results.json
|
| 4970 |
|
| 4971 |
+
Output sum: 3.971905
|
| 4972 |
</div>
|
| 4973 |
<div class="uv-install-logs" id="uv-logs-yamoe_run">
|
| 4974 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4975 |
<div class="uv-logs-content" style="display: none;">
|
| 4976 |
+
Downloading numpy (16.2MiB)
|
| 4977 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
|
|
|
| 4978 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
| 4979 |
Downloading torch (846.9MiB)
|
| 4980 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
| 4981 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4982 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4983 |
+
Downloading setuptools (1.1MiB)
|
| 4984 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4985 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4986 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4987 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4988 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4989 |
+
Downloading networkx (1.9MiB)
|
| 4990 |
Downloading sympy (6.0MiB)
|
| 4991 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4992 |
Downloading hf-xet (3.0MiB)
|
| 4993 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4994 |
+
Downloading triton (148.3MiB)
|
| 4995 |
Downloading nvidia-cufile-cu12
|
| 4996 |
Downloading hf-xet
|
| 4997 |
Downloading setuptools
|
|
|
|
| 5005 |
Downloading triton
|
| 5006 |
Downloading nvidia-cufft-cu12
|
| 5007 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 5008 |
Downloading nvidia-cusparse-cu12
|
| 5009 |
+
Downloading nvidia-cusparselt-cu12
|
| 5010 |
Downloading nvidia-nccl-cu12
|
| 5011 |
Downloading nvidia-cublas-cu12
|
| 5012 |
Downloading nvidia-cudnn-cu12
|
| 5013 |
Downloading torch
|
| 5014 |
+
Installed 37 packages in 449ms
|
| 5015 |
</div>
|
| 5016 |
</div>
|
| 5017 |
<div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 5018 |
+
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:01, 3.75it/s]
|
| 5019 |
+
Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 3.39it/s]
|
| 5020 |
+
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 6.84it/s]</div>
|
| 5021 |
<div class="cell-artifacts">
|
| 5022 |
<h4>Artifacts:</h4>
|
| 5023 |
<a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
|
|
|
|
| 5034 |
<span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
|
| 5035 |
<span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5036 |
</span> |
|
| 5037 |
+
Cell: binned_run | deps: torch, numpy | 39.53s
|
| 5038 |
| <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
|
| 5039 |
<button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
|
| 5040 |
<a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 5448 |
|
| 5449 |
Warming up (10 iterations)...
|
| 5450 |
Benchmarking (50 iterations)...
|
| 5451 |
+
Progress: 20% complete (avg: 37.247 ms)
|
| 5452 |
+
Progress: 40% complete (avg: 37.082 ms)
|
| 5453 |
+
Progress: 60% complete (avg: 36.706 ms)
|
| 5454 |
+
Progress: 80% complete (avg: 36.240 ms)
|
| 5455 |
|
| 5456 |
Output tensors:
|
| 5457 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
|
|
| 5461 |
Iterations: 50
|
| 5462 |
|
| 5463 |
Latency Statistics:
|
| 5464 |
+
Average: 35.794 ms
|
| 5465 |
+
Min: 33.227 ms
|
| 5466 |
+
Max: 37.582 ms
|
| 5467 |
+
Std Dev: 1.261 ms
|
| 5468 |
|
| 5469 |
Percentiles:
|
| 5470 |
+
P50 (median): 36.036 ms
|
| 5471 |
+
P95: 37.377 ms
|
| 5472 |
+
P99: 37.525 ms
|
| 5473 |
|
| 5474 |
Throughput:
|
| 5475 |
+
Tokens/sec: 2793.8
|
| 5476 |
+
Std Dev: 99.7
|
| 5477 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5478 |
|
| 5479 |
Saved benchmark results to binned_results.json
|
|
|
|
| 5483 |
<div class="uv-install-logs" id="uv-logs-binned_run">
|
| 5484 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5485 |
<div class="uv-logs-content" style="display: none;">
|
| 5486 |
+
Downloading sympy (6.0MiB)
|
| 5487 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5488 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5489 |
Downloading setuptools (1.1MiB)
|
| 5490 |
+
Downloading numpy (16.2MiB)
|
| 5491 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5492 |
Downloading triton (148.3MiB)
|
| 5493 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5494 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5495 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5496 |
Downloading torch (846.9MiB)
|
| 5497 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5498 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5499 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5500 |
+
Downloading networkx (1.9MiB)
|
| 5501 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
| 5502 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
|
|
|
| 5503 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5504 |
Downloading nvidia-cufile-cu12
|
| 5505 |
Downloading setuptools
|
|
|
|
| 5513 |
Downloading triton
|
| 5514 |
Downloading nvidia-cufft-cu12
|
| 5515 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 5516 |
Downloading nvidia-cusparselt-cu12
|
| 5517 |
+
Downloading nvidia-cusparse-cu12
|
| 5518 |
Downloading nvidia-nccl-cu12
|
| 5519 |
Downloading nvidia-cublas-cu12
|
| 5520 |
Downloading nvidia-cudnn-cu12
|
| 5521 |
Downloading torch
|
| 5522 |
+
Installed 26 packages in 525ms
|
| 5523 |
</div>
|
| 5524 |
</div>
|
| 5525 |
<div class="cell-artifacts">
|
|
|
|
| 5538 |
<span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
|
| 5539 |
<span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5540 |
</span> |
|
| 5541 |
+
Cell: gptoss_run | deps: torch, numpy | 40.29s
|
| 5542 |
| <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
|
| 5543 |
<button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
|
| 5544 |
<a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 5856 |
|
| 5857 |
Warming up (10 iterations)...
|
| 5858 |
Benchmarking (50 iterations)...
|
| 5859 |
+
Progress: 20% complete (avg: 48.814 ms)
|
| 5860 |
+
Progress: 40% complete (avg: 48.182 ms)
|
| 5861 |
+
Progress: 60% complete (avg: 47.686 ms)
|
| 5862 |
+
Progress: 80% complete (avg: 46.880 ms)
|
| 5863 |
|
| 5864 |
Output tensors:
|
| 5865 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
|
|
| 5869 |
Iterations: 50
|
| 5870 |
|
| 5871 |
Latency Statistics:
|
| 5872 |
+
Average: 45.933 ms
|
| 5873 |
+
Min: 40.056 ms
|
| 5874 |
+
Max: 49.512 ms
|
| 5875 |
+
Std Dev: 2.471 ms
|
| 5876 |
|
| 5877 |
Percentiles:
|
| 5878 |
+
P50 (median): 46.499 ms
|
| 5879 |
+
P95: 49.058 ms
|
| 5880 |
+
P99: 49.353 ms
|
| 5881 |
|
| 5882 |
Throughput:
|
| 5883 |
+
Tokens/sec: 2177.1
|
| 5884 |
+
Std Dev: 121.3
|
| 5885 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5886 |
|
| 5887 |
Saved benchmark results to gptoss_results.json
|
|
|
|
| 5892 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5893 |
<div class="uv-logs-content" style="display: none;">
|
| 5894 |
Downloading setuptools (1.1MiB)
|
| 5895 |
+
Downloading sympy (6.0MiB)
|
| 5896 |
+
Downloading numpy (16.2MiB)
|
| 5897 |
Downloading networkx (1.9MiB)
|
| 5898 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5899 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5900 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5901 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5902 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5903 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
| 5904 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
| 5905 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5906 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5907 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
|
|
|
| 5908 |
Downloading torch (846.9MiB)
|
| 5909 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5910 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5911 |
+
Downloading triton (148.3MiB)
|
| 5912 |
Downloading nvidia-cufile-cu12
|
| 5913 |
Downloading setuptools
|
| 5914 |
Downloading networkx
|
|
|
|
| 5927 |
Downloading nvidia-cublas-cu12
|
| 5928 |
Downloading nvidia-cudnn-cu12
|
| 5929 |
Downloading torch
|
| 5930 |
+
Installed 26 packages in 540ms
|
| 5931 |
</div>
|
| 5932 |
</div>
|
| 5933 |
<div class="cell-artifacts">
|
|
|
|
| 5946 |
<span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
|
| 5947 |
<span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5948 |
</span> |
|
| 5949 |
+
Cell: gptoss_training_run | deps: torch, numpy | 39.76s
|
| 5950 |
| <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
|
| 5951 |
<button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
|
| 5952 |
<a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 6247 |
|
| 6248 |
Warming up (10 iterations)...
|
| 6249 |
Benchmarking (50 iterations)...
|
| 6250 |
+
Progress: 20% complete (avg: 50.744 ms)
|
| 6251 |
+
Progress: 40% complete (avg: 50.240 ms)
|
| 6252 |
+
Progress: 60% complete (avg: 48.683 ms)
|
| 6253 |
+
Progress: 80% complete (avg: 47.222 ms)
|
| 6254 |
|
| 6255 |
Output tensors:
|
| 6256 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
|
|
| 6260 |
Iterations: 50
|
| 6261 |
|
| 6262 |
Latency Statistics:
|
| 6263 |
+
Average: 45.947 ms
|
| 6264 |
+
Min: 38.690 ms
|
| 6265 |
+
Max: 51.193 ms
|
| 6266 |
+
Std Dev: 3.915 ms
|
| 6267 |
|
| 6268 |
Percentiles:
|
| 6269 |
+
P50 (median): 45.209 ms
|
| 6270 |
+
P95: 51.039 ms
|
| 6271 |
+
P99: 51.144 ms
|
| 6272 |
|
| 6273 |
Throughput:
|
| 6274 |
+
Tokens/sec: 2176.4
|
| 6275 |
Std Dev: 188.8
|
| 6276 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6277 |
|
|
|
|
| 6282 |
<div class="uv-install-logs" id="uv-logs-gptoss_training_run">
|
| 6283 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6284 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6285 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6286 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
|
|
|
| 6287 |
Downloading sympy (6.0MiB)
|
|
|
|
| 6288 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
| 6289 |
Downloading torch (846.9MiB)
|
| 6290 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6291 |
Downloading setuptools (1.1MiB)
|
| 6292 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 6293 |
+
Downloading networkx (1.9MiB)
|
| 6294 |
+
Downloading numpy (16.2MiB)
|
| 6295 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6296 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6297 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6298 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6299 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6300 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 6301 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6302 |
Downloading triton (148.3MiB)
|
| 6303 |
Downloading nvidia-cufile-cu12
|
| 6304 |
Downloading setuptools
|
|
|
|
| 6318 |
Downloading nvidia-cublas-cu12
|
| 6319 |
Downloading nvidia-cudnn-cu12
|
| 6320 |
Downloading torch
|
| 6321 |
+
Installed 26 packages in 444ms
|
| 6322 |
</div>
|
| 6323 |
</div>
|
| 6324 |
<div class="cell-artifacts">
|
|
|
|
| 6337 |
<span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
|
| 6338 |
<span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6339 |
</span> |
|
| 6340 |
+
Cell: megablocks_run | deps: torch, numpy, kernels | 47.11s
|
| 6341 |
| <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
|
| 6342 |
<button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
|
| 6343 |
<a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 6566 |
|
| 6567 |
Warming up (10 iterations)...
|
| 6568 |
Benchmarking (50 iterations)...
|
| 6569 |
+
Progress: 20% complete (avg: 0.855 ms)
|
| 6570 |
+
Progress: 40% complete (avg: 0.840 ms)
|
| 6571 |
+
Progress: 60% complete (avg: 0.838 ms)
|
| 6572 |
+
Progress: 80% complete (avg: 2.699 ms)
|
| 6573 |
|
| 6574 |
Output tensors:
|
| 6575 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.061104, 0.055115], mean=0.000056, std=0.013535, norm=4.593927
|
|
|
|
| 6579 |
Iterations: 50
|
| 6580 |
|
| 6581 |
Latency Statistics:
|
| 6582 |
+
Average: 3.848 ms
|
| 6583 |
+
Min: 0.812 ms
|
| 6584 |
+
Max: 8.536 ms
|
| 6585 |
+
Std Dev: 3.698 ms
|
| 6586 |
|
| 6587 |
Percentiles:
|
| 6588 |
+
P50 (median): 0.839 ms
|
| 6589 |
+
P95: 8.500 ms
|
| 6590 |
+
P99: 8.529 ms
|
| 6591 |
|
| 6592 |
Throughput:
|
| 6593 |
+
Tokens/sec: 25988.6
|
| 6594 |
+
Std Dev: 53035.4
|
| 6595 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6596 |
|
| 6597 |
Saved benchmark results to megablocks_results.json
|
|
|
|
| 6602 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6603 |
<div class="uv-logs-content" style="display: none;">
|
| 6604 |
Downloading sympy (6.0MiB)
|
| 6605 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6606 |
Downloading numpy (16.2MiB)
|
|
|
|
| 6607 |
Downloading hf-xet (3.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6608 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6609 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 6610 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6611 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6612 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
| 6613 |
Downloading networkx (1.9MiB)
|
| 6614 |
+
Downloading triton (148.3MiB)
|
| 6615 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6616 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6617 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6618 |
+
Downloading torch (846.9MiB)
|
| 6619 |
+
Downloading setuptools (1.1MiB)
|
| 6620 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6621 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 6622 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6623 |
Downloading nvidia-cufile-cu12
|
| 6624 |
Downloading hf-xet
|
| 6625 |
Downloading setuptools
|
|
|
|
| 6639 |
Downloading nvidia-cublas-cu12
|
| 6640 |
Downloading nvidia-cudnn-cu12
|
| 6641 |
Downloading torch
|
| 6642 |
+
Installed 37 packages in 459ms
|
| 6643 |
</div>
|
| 6644 |
</div>
|
| 6645 |
<div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 6646 |
+
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:11, 5.87it/s]
|
| 6647 |
+
Fetching 66 files: 3%|▎ | 2/66 [00:00<00:10, 6.31it/s]
|
| 6648 |
+
Fetching 66 files: 17%|█▋ | 11/66 [00:00<00:01, 34.40it/s]
|
| 6649 |
+
Fetching 66 files: 24%|██▍ | 16/66 [00:00<00:01, 38.36it/s]
|
| 6650 |
+
Fetching 66 files: 32%|███▏ | 21/66 [00:00<00:01, 23.86it/s]
|
| 6651 |
+
Fetching 66 files: 52%|█████▏ | 34/66 [00:01<00:00, 39.79it/s]
|
| 6652 |
+
Fetching 66 files: 59%|█████▉ | 39/66 [00:01<00:00, 28.73it/s]
|
| 6653 |
+
Fetching 66 files: 82%|████████▏ | 54/66 [00:01<00:00, 40.50it/s]
|
| 6654 |
+
Fetching 66 files: 94%|█████████▍| 62/66 [00:01<00:00, 45.88it/s]
|
| 6655 |
+
Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 37.06it/s]</div>
|
| 6656 |
<div class="cell-artifacts">
|
| 6657 |
<h4>Artifacts:</h4>
|
| 6658 |
<a href="artifacts/megablocks_run/megablocks_results.json" class="artifact" target="_blank">megablocks_results.json</a>
|
|
|
|
| 6669 |
<span onclick="toggleOutput('visualization')" style="cursor: pointer;">▼ output</span>
|
| 6670 |
<span id="uv-indicator-visualization" onclick="toggleUvLogsFromHeader('visualization')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6671 |
</span> |
|
| 6672 |
+
Cell: visualization | deps: matplotlib | 3.13s
|
| 6673 |
| <button class="run-btn" onclick="runCell('visualization')">▶ run</button>
|
| 6674 |
<button class="copy-btn" onclick="copyCell('visualization')">Copy</button>
|
| 6675 |
<a href="cells/visualization.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 6915 |
Performance Summary:
|
| 6916 |
Implementation Avg (ms) P95 (ms) Tokens/sec Relative Speed
|
| 6917 |
--------------------------------------------------------------------------------
|
| 6918 |
+
megablocks_results 3.85 8.50 25989 1.00x
|
| 6919 |
+
yamoe_results 4.25 4.27 23549 0.91x
|
| 6920 |
+
binned_results 35.79 37.38 2794 0.11x
|
| 6921 |
+
gptoss_results 45.93 49.06 2177 0.08x
|
| 6922 |
+
gptoss_training_results 45.95 51.04 2176 0.08x
|
| 6923 |
+
|
| 6924 |
+
Fastest: megablocks_results (3.85ms avg)
|
| 6925 |
+
Slowest: gptoss_training_results (45.95ms avg)
|
| 6926 |
+
Max Speedup: 11.9x
|
| 6927 |
</div>
|
| 6928 |
<div class="uv-install-logs" id="uv-logs-visualization">
|
| 6929 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6930 |
<div class="uv-logs-content" style="display: none;">
|
| 6931 |
+
Downloading fonttools (4.7MiB)
|
| 6932 |
Downloading numpy (16.2MiB)
|
| 6933 |
+
Downloading matplotlib (8.3MiB)
|
| 6934 |
Downloading kiwisolver (1.4MiB)
|
| 6935 |
Downloading pillow (6.3MiB)
|
|
|
|
|
|
|
| 6936 |
Downloading kiwisolver
|
| 6937 |
Downloading pillow
|
| 6938 |
Downloading fonttools
|
| 6939 |
Downloading matplotlib
|
| 6940 |
Downloading numpy
|
| 6941 |
+
Installed 11 packages in 46ms
|
| 6942 |
</div>
|
| 6943 |
</div>
|
| 6944 |
<div class="cell-artifacts">
|