Upload folder using huggingface_hub
Browse files- flash_attn/artifacts/benchmark/Attention Benchmark.csv +6 -6
- flash_attn/artifacts/benchmark/Attention Benchmark.png +2 -2
- flash_attn/benchmark.html +157 -81
- flash_attn/cells/benchmark.py +3 -1
- flash_attn/cells/nv.py +3 -0
- moe_benchmarks/megablocks/megablocks_only.html +2 -69
- moe_benchmarks/megablocks_yamoe/artifacts/binned_run/binned_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/artifacts/megablocks_run/megablocks_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/artifacts/visualization/moe_performance_comparison.png +2 -2
- moe_benchmarks/megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json +10 -10
- moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc +0 -0
- moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc +0 -0
- moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html +2 -69
- moe_benchmarks/megablocks_yamoe/torch_profile.html +222 -222
flash_attn/artifacts/benchmark/Attention Benchmark.csv
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
seq_len,torch_cudnn,torch_cudnn_compile_d,torch_cudnn_compile_ma,torch_flash,torch_flash_compile_d,torch_flash_compile_ma,hf_flash_attn,hf_flash_attn3
|
| 2 |
-
4224.000000,3.
|
| 3 |
-
4352.000000,4.
|
| 4 |
-
4416.000000,4.
|
| 5 |
-
4480.000000,4.
|
| 6 |
-
4544.000000,4.
|
| 7 |
-
4608.000000,4.
|
|
|
|
| 1 |
seq_len,torch_cudnn,torch_cudnn_compile_d,torch_cudnn_compile_ma,torch_flash,torch_flash_compile_d,torch_flash_compile_ma,hf_flash_attn,hf_flash_attn3
|
| 2 |
+
4224.000000,3.807456,3.789232,4.191984,3.974816,3.953792,4.322096,3.403408,3.328416
|
| 3 |
+
4352.000000,4.078480,4.072352,4.420736,4.400480,4.390000,4.738144,3.833424,3.755664
|
| 4 |
+
4416.000000,4.139680,4.134800,4.490464,4.451040,4.443680,4.795104,3.890112,3.860992
|
| 5 |
+
4480.000000,4.202048,4.195216,4.561248,4.524608,4.519520,4.877056,3.948816,3.866704
|
| 6 |
+
4544.000000,4.434992,4.427040,4.788000,4.582336,4.571872,4.945728,4.015280,3.982320
|
| 7 |
+
4608.000000,4.499456,4.490816,4.874464,4.669152,4.663648,5.035232,4.071872,3.983520
|
flash_attn/artifacts/benchmark/Attention Benchmark.png
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
flash_attn/benchmark.html
CHANGED
|
@@ -3715,19 +3715,86 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3715 |
</div>
|
| 3716 |
|
| 3717 |
<div class="main-content">
|
| 3718 |
-
<div class="cell" id="cell-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3719 |
<div class="cell-header">
|
| 3720 |
<span class="collapse-indicators">
|
| 3721 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3722 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3724 |
</span> |
|
| 3725 |
-
Cell: benchmark | 77.
|
| 3726 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3727 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3728 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3729 |
</div>
|
| 3730 |
-
<div id="code-benchmark" class="cell-code" data-lines="
|
| 3731 |
<div class="highlight-with-lines">
|
| 3732 |
<div class="line-numbers" id="lines-benchmark">
|
| 3733 |
<a class="line-number" data-cell="benchmark" data-line="1" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 1, true);">1</a>
|
|
@@ -4073,6 +4140,8 @@ Cell: benchmark | 77.66s
|
|
| 4073 |
<a class="line-number" data-cell="benchmark" data-line="341" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 341, true);">341</a>
|
| 4074 |
<a class="line-number" data-cell="benchmark" data-line="342" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 342, true);">342</a>
|
| 4075 |
<a class="line-number" data-cell="benchmark" data-line="343" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 343, true);">343</a>
|
|
|
|
|
|
|
| 4076 |
</div>
|
| 4077 |
<div class="code-wrap">
|
| 4078 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
|
@@ -4417,7 +4486,9 @@ Cell: benchmark | 77.66s
|
|
| 4417 |
|
| 4418 |
<span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">inference_mode</span><span class="p">():</span>
|
| 4419 |
<span class="n">correctness</span><span class="p">()</span>
|
| 4420 |
-
<span class="n">benchmark_fn</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="n">print_data</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">save_path</span><span class="o">=</span><span class="n">output_dir</span><span class="o">.</span><span class="n">as_posix</span><span class="p">())</span>
|
|
|
|
|
|
|
| 4421 |
</pre></div>
|
| 4422 |
|
| 4423 |
<div class="code-line-highlight" id="line-highlight-benchmark"></div>
|
|
@@ -4433,105 +4504,105 @@ xFormers not found.
|
|
| 4433 |
|
| 4434 |
|
| 4435 |
===== Testing shape: (1, 4224, 24, 128) =====
|
| 4436 |
-
torch_cudnn : absmax=0.
|
| 4437 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4438 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4439 |
-
torch_flash : absmax=0.
|
| 4440 |
-
torch_flash_compile_d : absmax=0.
|
| 4441 |
-
torch_flash_compile_ma : absmax=0.
|
| 4442 |
-
hf_flash_attn : absmax=0.
|
| 4443 |
-
hf_flash_attn3 : absmax=0.
|
| 4444 |
|
| 4445 |
|
| 4446 |
===== Testing shape: (1, 4352, 24, 128) =====
|
| 4447 |
-
torch_cudnn : absmax=0.
|
| 4448 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4449 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4450 |
-
torch_flash : absmax=0.
|
| 4451 |
-
torch_flash_compile_d : absmax=0.
|
| 4452 |
-
torch_flash_compile_ma : absmax=0.
|
| 4453 |
-
hf_flash_attn : absmax=0.
|
| 4454 |
-
hf_flash_attn3 : absmax=0.
|
| 4455 |
|
| 4456 |
|
| 4457 |
===== Testing shape: (1, 4416, 24, 128) =====
|
| 4458 |
-
torch_cudnn : absmax=0.
|
| 4459 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4460 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4461 |
-
torch_flash : absmax=0.
|
| 4462 |
-
torch_flash_compile_d : absmax=0.
|
| 4463 |
-
torch_flash_compile_ma : absmax=0.
|
| 4464 |
-
hf_flash_attn : absmax=0.
|
| 4465 |
-
hf_flash_attn3 : absmax=0.
|
| 4466 |
|
| 4467 |
|
| 4468 |
===== Testing shape: (1, 4480, 24, 128) =====
|
| 4469 |
-
torch_cudnn : absmax=0.
|
| 4470 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4471 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4472 |
-
torch_flash : absmax=0.
|
| 4473 |
-
torch_flash_compile_d : absmax=0.
|
| 4474 |
-
torch_flash_compile_ma : absmax=0.
|
| 4475 |
-
hf_flash_attn : absmax=0.
|
| 4476 |
-
hf_flash_attn3 : absmax=0.
|
| 4477 |
|
| 4478 |
|
| 4479 |
===== Testing shape: (1, 4544, 24, 128) =====
|
| 4480 |
torch_cudnn : absmax=0.000976, mae=0.000072, mse=0.000000
|
| 4481 |
torch_cudnn_compile_d : absmax=0.000976, mae=0.000072, mse=0.000000
|
| 4482 |
torch_cudnn_compile_ma : absmax=0.000976, mae=0.000072, mse=0.000000
|
| 4483 |
-
torch_flash : absmax=0.
|
| 4484 |
-
torch_flash_compile_d : absmax=0.
|
| 4485 |
-
torch_flash_compile_ma : absmax=0.
|
| 4486 |
-
hf_flash_attn : absmax=0.
|
| 4487 |
-
hf_flash_attn3 : absmax=0.
|
| 4488 |
|
| 4489 |
|
| 4490 |
===== Testing shape: (1, 4608, 24, 128) =====
|
| 4491 |
-
torch_cudnn : absmax=0.
|
| 4492 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4493 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4494 |
-
torch_flash : absmax=0.
|
| 4495 |
-
torch_flash_compile_d : absmax=0.
|
| 4496 |
-
torch_flash_compile_ma : absmax=0.
|
| 4497 |
-
hf_flash_attn : absmax=0.
|
| 4498 |
-
hf_flash_attn3 : absmax=0.
|
| 4499 |
Attention Benchmark:
|
| 4500 |
seq_len torch_cudnn torch_cudnn_compile_d torch_cudnn_compile_ma torch_flash torch_flash_compile_d torch_flash_compile_ma hf_flash_attn hf_flash_attn3
|
| 4501 |
-
0 4224.0 3.
|
| 4502 |
-
1 4352.0 4.
|
| 4503 |
-
2 4416.0 4.
|
| 4504 |
-
3 4480.0 4.
|
| 4505 |
-
4 4544.0 4.
|
| 4506 |
-
5 4608.0 4.
|
| 4507 |
</div>
|
| 4508 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4509 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4510 |
<div class="uv-logs-content" style="display: none;">
|
| 4511 |
-
Downloading
|
| 4512 |
-
Downloading
|
| 4513 |
-
Downloading
|
| 4514 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4515 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4516 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4517 |
-
Downloading fonttools (4.7MiB)
|
| 4518 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4519 |
-
Downloading triton (148.3MiB)
|
| 4520 |
-
Downloading setuptools (1.1MiB)
|
| 4521 |
-
Downloading pillow (6.3MiB)
|
| 4522 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4523 |
-
Downloading
|
| 4524 |
Downloading matplotlib (8.3MiB)
|
|
|
|
|
|
|
|
|
|
| 4525 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4526 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4527 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4528 |
-
Downloading torch (846.9MiB)
|
| 4529 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
| 4530 |
Downloading numpy (16.2MiB)
|
| 4531 |
-
Downloading
|
| 4532 |
-
Downloading nvidia-
|
| 4533 |
-
Downloading
|
| 4534 |
-
Downloading
|
|
|
|
| 4535 |
Downloading nvidia-cufile-cu12
|
| 4536 |
Downloading kiwisolver
|
| 4537 |
Downloading hf-xet
|
|
@@ -4544,30 +4615,35 @@ Downloading hf-xet (3.0MiB)
|
|
| 4544 |
Downloading sympy
|
| 4545 |
Downloading numpy
|
| 4546 |
Downloading nvidia-nvjitlink-cu12
|
| 4547 |
-
Downloading pandas
|
| 4548 |
Downloading nvidia-curand-cu12
|
|
|
|
| 4549 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 4550 |
Downloading triton
|
| 4551 |
Downloading nvidia-cufft-cu12
|
| 4552 |
Downloading nvidia-cusolver-cu12
|
| 4553 |
-
Downloading nvidia-cusparse-cu12
|
| 4554 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 4555 |
Downloading nvidia-nccl-cu12
|
| 4556 |
Downloading nvidia-cublas-cu12
|
| 4557 |
Downloading nvidia-cudnn-cu12
|
| 4558 |
Downloading torch
|
| 4559 |
-
Installed 49 packages in
|
| 4560 |
</div>
|
| 4561 |
</div>
|
| 4562 |
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 4563 |
-
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:
|
| 4564 |
-
Fetching 20 files: 10%|█ | 2/20 [00:
|
| 4565 |
-
Fetching 20 files: 100%|██████████| 20/20 [00:
|
| 4566 |
|
| 4567 |
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4568 |
-
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00,
|
| 4569 |
-
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.
|
| 4570 |
-
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4571 |
<div class="cell-artifacts">
|
| 4572 |
<h4>Artifacts:</h4>
|
| 4573 |
<a href="artifacts/benchmark/Attention Benchmark.png" class="artifact" target="_blank">Attention Benchmark.png</a>
|
|
|
|
| 3715 |
</div>
|
| 3716 |
|
| 3717 |
<div class="main-content">
|
| 3718 |
+
<div class="cell" id="cell-nv">
|
| 3719 |
+
<div class="cell-header">
|
| 3720 |
+
<span class="collapse-indicators">
|
| 3721 |
+
<span onclick="toggleCode('nv')" style="cursor: pointer;">▼ code</span>
|
| 3722 |
+
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
+
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3724 |
+
</span> |
|
| 3725 |
+
Cell: nv | 0.70s
|
| 3726 |
+
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3727 |
+
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3728 |
+
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
| 3729 |
+
</div>
|
| 3730 |
+
<div id="code-nv" class="cell-code" data-lines="3">
|
| 3731 |
+
<div class="highlight-with-lines">
|
| 3732 |
+
<div class="line-numbers" id="lines-nv">
|
| 3733 |
+
<a class="line-number" data-cell="nv" data-line="1" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 1, true);">1</a>
|
| 3734 |
+
<a class="line-number" data-cell="nv" data-line="2" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 2, true);">2</a>
|
| 3735 |
+
<a class="line-number" data-cell="nv" data-line="3" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 3, true);">3</a>
|
| 3736 |
+
</div>
|
| 3737 |
+
<div class="code-wrap">
|
| 3738 |
+
<div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">subprocess</span>
|
| 3739 |
+
|
| 3740 |
+
<span class="nb">print</span><span class="p">(</span><span class="n">subprocess</span><span class="o">.</span><span class="n">run</span><span class="p">([</span><span class="s2">"nvidia-smi"</span><span class="p">],</span> <span class="n">capture_output</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">text</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">stdout</span><span class="p">)</span>
|
| 3741 |
+
</pre></div>
|
| 3742 |
+
|
| 3743 |
+
<div class="code-line-highlight" id="line-highlight-nv"></div>
|
| 3744 |
+
</div>
|
| 3745 |
+
</div>
|
| 3746 |
+
</div>
|
| 3747 |
+
<div id="output-nv" class="cell-output">
|
| 3748 |
+
<div class="cell-stdout">Fri Sep 26 02:23:10 2025
|
| 3749 |
+
+-----------------------------------------------------------------------------------------+
|
| 3750 |
+
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3751 |
+
|-----------------------------------------+------------------------+----------------------+
|
| 3752 |
+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3753 |
+
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3754 |
+
| | | MIG M. |
|
| 3755 |
+
|=========================================+========================+======================|
|
| 3756 |
+
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3757 |
+
| 0% 31C P8 26W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3758 |
+
| | | N/A |
|
| 3759 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 3760 |
+
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3761 |
+
| 0% 30C P8 26W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3762 |
+
| | | N/A |
|
| 3763 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 3764 |
+
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3765 |
+
| 0% 31C P8 27W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3766 |
+
| | | N/A |
|
| 3767 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 3768 |
+
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3769 |
+
| 0% 30C P8 27W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3770 |
+
| | | N/A |
|
| 3771 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 3772 |
+
|
| 3773 |
+
+-----------------------------------------------------------------------------------------+
|
| 3774 |
+
| Processes: |
|
| 3775 |
+
| GPU GI CI PID Type Process name GPU Memory |
|
| 3776 |
+
| ID ID Usage |
|
| 3777 |
+
|=========================================================================================|
|
| 3778 |
+
| No running processes found |
|
| 3779 |
+
+-----------------------------------------------------------------------------------------+
|
| 3780 |
+
|
| 3781 |
+
</div>
|
| 3782 |
+
</div>
|
| 3783 |
+
</div>
|
| 3784 |
+
|
| 3785 |
+
<div class="cell cell-failed" id="cell-benchmark">
|
| 3786 |
<div class="cell-header">
|
| 3787 |
<span class="collapse-indicators">
|
| 3788 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3789 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3790 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3791 |
</span> |
|
| 3792 |
+
Cell: benchmark | 77.48s | FAILED
|
| 3793 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3794 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3795 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3796 |
</div>
|
| 3797 |
+
<div id="code-benchmark" class="cell-code" data-lines="345">
|
| 3798 |
<div class="highlight-with-lines">
|
| 3799 |
<div class="line-numbers" id="lines-benchmark">
|
| 3800 |
<a class="line-number" data-cell="benchmark" data-line="1" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 1, true);">1</a>
|
|
|
|
| 4140 |
<a class="line-number" data-cell="benchmark" data-line="341" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 341, true);">341</a>
|
| 4141 |
<a class="line-number" data-cell="benchmark" data-line="342" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 342, true);">342</a>
|
| 4142 |
<a class="line-number" data-cell="benchmark" data-line="343" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 343, true);">343</a>
|
| 4143 |
+
<a class="line-number" data-cell="benchmark" data-line="344" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 344, true);">344</a>
|
| 4144 |
+
<a class="line-number" data-cell="benchmark" data-line="345" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 345, true);">345</a>
|
| 4145 |
</div>
|
| 4146 |
<div class="code-wrap">
|
| 4147 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
|
|
|
| 4486 |
|
| 4487 |
<span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">inference_mode</span><span class="p">():</span>
|
| 4488 |
<span class="n">correctness</span><span class="p">()</span>
|
| 4489 |
+
<span class="n">fig</span> <span class="o">=</span> <span class="n">benchmark_fn</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="n">print_data</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">save_path</span><span class="o">=</span><span class="n">output_dir</span><span class="o">.</span><span class="n">as_posix</span><span class="p">())</span>
|
| 4490 |
+
<span class="n">fig</span><span class="o">.</span><span class="n">savefig</span><span class="p">(</span><span class="n">output_dir</span> <span class="o">/</span> <span class="s2">"attention_benchmark.png"</span><span class="p">,</span> <span class="n">dpi</span><span class="o">=</span><span class="mi">300</span><span class="p">,</span> <span class="n">bbox_inches</span><span class="o">=</span><span class="s2">"tight"</span><span class="p">)</span>
|
| 4491 |
+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Benchmark plot saved to: </span><span class="si">{</span><span class="n">output_dir</span><span class="w"> </span><span class="o">/</span><span class="w"> </span><span class="s1">'attention_benchmark.png'</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
| 4492 |
</pre></div>
|
| 4493 |
|
| 4494 |
<div class="code-line-highlight" id="line-highlight-benchmark"></div>
|
|
|
|
| 4504 |
|
| 4505 |
|
| 4506 |
===== Testing shape: (1, 4224, 24, 128) =====
|
| 4507 |
+
torch_cudnn : absmax=0.000994, mae=0.000075, mse=0.000000
|
| 4508 |
+
torch_cudnn_compile_d : absmax=0.000994, mae=0.000075, mse=0.000000
|
| 4509 |
+
torch_cudnn_compile_ma : absmax=0.000994, mae=0.000075, mse=0.000000
|
| 4510 |
+
torch_flash : absmax=0.000994, mae=0.000075, mse=0.000000
|
| 4511 |
+
torch_flash_compile_d : absmax=0.000994, mae=0.000075, mse=0.000000
|
| 4512 |
+
torch_flash_compile_ma : absmax=0.000994, mae=0.000075, mse=0.000000
|
| 4513 |
+
hf_flash_attn : absmax=0.000994, mae=0.000075, mse=0.000000
|
| 4514 |
+
hf_flash_attn3 : absmax=0.000994, mae=0.000075, mse=0.000000
|
| 4515 |
|
| 4516 |
|
| 4517 |
===== Testing shape: (1, 4352, 24, 128) =====
|
| 4518 |
+
torch_cudnn : absmax=0.001718, mae=0.000073, mse=0.000000
|
| 4519 |
+
torch_cudnn_compile_d : absmax=0.001718, mae=0.000073, mse=0.000000
|
| 4520 |
+
torch_cudnn_compile_ma : absmax=0.001718, mae=0.000073, mse=0.000000
|
| 4521 |
+
torch_flash : absmax=0.001718, mae=0.000073, mse=0.000000
|
| 4522 |
+
torch_flash_compile_d : absmax=0.001718, mae=0.000073, mse=0.000000
|
| 4523 |
+
torch_flash_compile_ma : absmax=0.001718, mae=0.000073, mse=0.000000
|
| 4524 |
+
hf_flash_attn : absmax=0.001718, mae=0.000073, mse=0.000000
|
| 4525 |
+
hf_flash_attn3 : absmax=0.001718, mae=0.000073, mse=0.000000
|
| 4526 |
|
| 4527 |
|
| 4528 |
===== Testing shape: (1, 4416, 24, 128) =====
|
| 4529 |
+
torch_cudnn : absmax=0.001273, mae=0.000073, mse=0.000000
|
| 4530 |
+
torch_cudnn_compile_d : absmax=0.001273, mae=0.000073, mse=0.000000
|
| 4531 |
+
torch_cudnn_compile_ma : absmax=0.001273, mae=0.000073, mse=0.000000
|
| 4532 |
+
torch_flash : absmax=0.001440, mae=0.000073, mse=0.000000
|
| 4533 |
+
torch_flash_compile_d : absmax=0.001440, mae=0.000073, mse=0.000000
|
| 4534 |
+
torch_flash_compile_ma : absmax=0.001440, mae=0.000073, mse=0.000000
|
| 4535 |
+
hf_flash_attn : absmax=0.001440, mae=0.000073, mse=0.000000
|
| 4536 |
+
hf_flash_attn3 : absmax=0.001440, mae=0.000073, mse=0.000000
|
| 4537 |
|
| 4538 |
|
| 4539 |
===== Testing shape: (1, 4480, 24, 128) =====
|
| 4540 |
+
torch_cudnn : absmax=0.001284, mae=0.000073, mse=0.000000
|
| 4541 |
+
torch_cudnn_compile_d : absmax=0.001284, mae=0.000073, mse=0.000000
|
| 4542 |
+
torch_cudnn_compile_ma : absmax=0.001284, mae=0.000073, mse=0.000000
|
| 4543 |
+
torch_flash : absmax=0.001284, mae=0.000073, mse=0.000000
|
| 4544 |
+
torch_flash_compile_d : absmax=0.001284, mae=0.000073, mse=0.000000
|
| 4545 |
+
torch_flash_compile_ma : absmax=0.001284, mae=0.000073, mse=0.000000
|
| 4546 |
+
hf_flash_attn : absmax=0.001284, mae=0.000073, mse=0.000000
|
| 4547 |
+
hf_flash_attn3 : absmax=0.001284, mae=0.000073, mse=0.000000
|
| 4548 |
|
| 4549 |
|
| 4550 |
===== Testing shape: (1, 4544, 24, 128) =====
|
| 4551 |
torch_cudnn : absmax=0.000976, mae=0.000072, mse=0.000000
|
| 4552 |
torch_cudnn_compile_d : absmax=0.000976, mae=0.000072, mse=0.000000
|
| 4553 |
torch_cudnn_compile_ma : absmax=0.000976, mae=0.000072, mse=0.000000
|
| 4554 |
+
torch_flash : absmax=0.000834, mae=0.000072, mse=0.000000
|
| 4555 |
+
torch_flash_compile_d : absmax=0.000834, mae=0.000072, mse=0.000000
|
| 4556 |
+
torch_flash_compile_ma : absmax=0.000834, mae=0.000072, mse=0.000000
|
| 4557 |
+
hf_flash_attn : absmax=0.000834, mae=0.000072, mse=0.000000
|
| 4558 |
+
hf_flash_attn3 : absmax=0.000815, mae=0.000072, mse=0.000000
|
| 4559 |
|
| 4560 |
|
| 4561 |
===== Testing shape: (1, 4608, 24, 128) =====
|
| 4562 |
+
torch_cudnn : absmax=0.000926, mae=0.000072, mse=0.000000
|
| 4563 |
+
torch_cudnn_compile_d : absmax=0.000926, mae=0.000072, mse=0.000000
|
| 4564 |
+
torch_cudnn_compile_ma : absmax=0.000926, mae=0.000072, mse=0.000000
|
| 4565 |
+
torch_flash : absmax=0.000926, mae=0.000072, mse=0.000000
|
| 4566 |
+
torch_flash_compile_d : absmax=0.000926, mae=0.000072, mse=0.000000
|
| 4567 |
+
torch_flash_compile_ma : absmax=0.000926, mae=0.000072, mse=0.000000
|
| 4568 |
+
hf_flash_attn : absmax=0.000926, mae=0.000072, mse=0.000000
|
| 4569 |
+
hf_flash_attn3 : absmax=0.000926, mae=0.000072, mse=0.000000
|
| 4570 |
Attention Benchmark:
|
| 4571 |
seq_len torch_cudnn torch_cudnn_compile_d torch_cudnn_compile_ma torch_flash torch_flash_compile_d torch_flash_compile_ma hf_flash_attn hf_flash_attn3
|
| 4572 |
+
0 4224.0 3.807456 3.789232 4.191984 3.974816 3.953792 4.322096 3.403408 3.328416
|
| 4573 |
+
1 4352.0 4.078480 4.072352 4.420736 4.400480 4.390000 4.738144 3.833424 3.755664
|
| 4574 |
+
2 4416.0 4.139680 4.134800 4.490464 4.451040 4.443680 4.795104 3.890112 3.860992
|
| 4575 |
+
3 4480.0 4.202048 4.195216 4.561248 4.524608 4.519520 4.877056 3.948816 3.866704
|
| 4576 |
+
4 4544.0 4.434992 4.427040 4.788000 4.582336 4.571872 4.945728 4.015280 3.982320
|
| 4577 |
+
5 4608.0 4.499456 4.490816 4.874464 4.669152 4.663648 5.035232 4.071872 3.983520
|
| 4578 |
</div>
|
| 4579 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4580 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4581 |
<div class="uv-logs-content" style="display: none;">
|
| 4582 |
+
Downloading hf-xet (3.0MiB)
|
| 4583 |
+
Downloading setuptools (1.1MiB)
|
| 4584 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4585 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4586 |
+
Downloading pandas (11.8MiB)
|
| 4587 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4588 |
+
Downloading networkx (1.9MiB)
|
| 4589 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4590 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4591 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4592 |
+
Downloading triton (148.3MiB)
|
| 4593 |
Downloading matplotlib (8.3MiB)
|
| 4594 |
+
Downloading kiwisolver (1.4MiB)
|
| 4595 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4596 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4597 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
|
|
|
| 4598 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4599 |
+
Downloading torch (846.9MiB)
|
| 4600 |
Downloading numpy (16.2MiB)
|
| 4601 |
+
Downloading pillow (6.3MiB)
|
| 4602 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4603 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4604 |
+
Downloading fonttools (4.7MiB)
|
| 4605 |
+
Downloading sympy (6.0MiB)
|
| 4606 |
Downloading nvidia-cufile-cu12
|
| 4607 |
Downloading kiwisolver
|
| 4608 |
Downloading hf-xet
|
|
|
|
| 4615 |
Downloading sympy
|
| 4616 |
Downloading numpy
|
| 4617 |
Downloading nvidia-nvjitlink-cu12
|
|
|
|
| 4618 |
Downloading nvidia-curand-cu12
|
| 4619 |
+
Downloading pandas
|
| 4620 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 4621 |
Downloading triton
|
| 4622 |
Downloading nvidia-cufft-cu12
|
| 4623 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4624 |
Downloading nvidia-cusparselt-cu12
|
| 4625 |
+
Downloading nvidia-cusparse-cu12
|
| 4626 |
Downloading nvidia-nccl-cu12
|
| 4627 |
Downloading nvidia-cublas-cu12
|
| 4628 |
Downloading nvidia-cudnn-cu12
|
| 4629 |
Downloading torch
|
| 4630 |
+
Installed 49 packages in 520ms
|
| 4631 |
</div>
|
| 4632 |
</div>
|
| 4633 |
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 4634 |
+
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:03, 4.89it/s]
|
| 4635 |
+
Fetching 20 files: 10%|█ | 2/20 [00:01<00:17, 1.02it/s]
|
| 4636 |
+
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 11.55it/s]
|
| 4637 |
|
| 4638 |
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4639 |
+
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 6.41it/s]
|
| 4640 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.10it/s]
|
| 4641 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.52it/s]
|
| 4642 |
+
Traceback (most recent call last):
|
| 4643 |
+
File "/repo/flash_attn/.uvnote/cells/benchmark.py", line 344, in <module>
|
| 4644 |
+
fig.savefig(output_dir / "attention_benchmark.png", dpi=300, bbox_inches="tight")
|
| 4645 |
+
^^^^^^^^^^^
|
| 4646 |
+
AttributeError: 'NoneType' object has no attribute 'savefig'</div>
|
| 4647 |
<div class="cell-artifacts">
|
| 4648 |
<h4>Artifacts:</h4>
|
| 4649 |
<a href="artifacts/benchmark/Attention Benchmark.png" class="artifact" target="_blank">Attention Benchmark.png</a>
|
flash_attn/cells/benchmark.py
CHANGED
|
@@ -340,4 +340,6 @@ def benchmark_fn(seq_len: int, provider: str):
|
|
| 340 |
|
| 341 |
with torch.inference_mode():
|
| 342 |
correctness()
|
| 343 |
-
benchmark_fn.run(print_data=True, save_path=output_dir.as_posix())
|
|
|
|
|
|
|
|
|
| 340 |
|
| 341 |
with torch.inference_mode():
|
| 342 |
correctness()
|
| 343 |
+
fig = benchmark_fn.run(print_data=True, save_path=output_dir.as_posix())
|
| 344 |
+
fig.savefig(output_dir / "attention_benchmark.png", dpi=300, bbox_inches="tight")
|
| 345 |
+
print(f"Benchmark plot saved to: {output_dir / 'attention_benchmark.png'}")
|
flash_attn/cells/nv.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess
|
| 2 |
+
|
| 3 |
+
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
|
moe_benchmarks/megablocks/megablocks_only.html
CHANGED
|
@@ -3715,74 +3715,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3715 |
</div>
|
| 3716 |
|
| 3717 |
<div class="main-content">
|
| 3718 |
-
<
|
| 3719 |
-
<div class="cell-header">
|
| 3720 |
-
<span class="collapse-indicators">
|
| 3721 |
-
<span onclick="toggleCode('nv')" style="cursor: pointer;">▼ code</span>
|
| 3722 |
-
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
-
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3724 |
-
</span> |
|
| 3725 |
-
Cell: nv | 0.67s
|
| 3726 |
-
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3727 |
-
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3728 |
-
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
| 3729 |
-
</div>
|
| 3730 |
-
<div id="code-nv" class="cell-code" data-lines="3">
|
| 3731 |
-
<div class="highlight-with-lines">
|
| 3732 |
-
<div class="line-numbers" id="lines-nv">
|
| 3733 |
-
<a class="line-number" data-cell="nv" data-line="1" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 1, true);">1</a>
|
| 3734 |
-
<a class="line-number" data-cell="nv" data-line="2" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 2, true);">2</a>
|
| 3735 |
-
<a class="line-number" data-cell="nv" data-line="3" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 3, true);">3</a>
|
| 3736 |
-
</div>
|
| 3737 |
-
<div class="code-wrap">
|
| 3738 |
-
<div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">subprocess</span>
|
| 3739 |
-
|
| 3740 |
-
<span class="nb">print</span><span class="p">(</span><span class="n">subprocess</span><span class="o">.</span><span class="n">run</span><span class="p">([</span><span class="s2">"nvidia-smi"</span><span class="p">],</span> <span class="n">capture_output</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">text</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">stdout</span><span class="p">)</span>
|
| 3741 |
-
</pre></div>
|
| 3742 |
-
|
| 3743 |
-
<div class="code-line-highlight" id="line-highlight-nv"></div>
|
| 3744 |
-
</div>
|
| 3745 |
-
</div>
|
| 3746 |
-
</div>
|
| 3747 |
-
<div id="output-nv" class="cell-output">
|
| 3748 |
-
<div class="cell-stdout">Thu Sep 25 20:02:38 2025
|
| 3749 |
-
+-----------------------------------------------------------------------------------------+
|
| 3750 |
-
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3751 |
-
|-----------------------------------------+------------------------+----------------------+
|
| 3752 |
-
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3753 |
-
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3754 |
-
| | | MIG M. |
|
| 3755 |
-
|=========================================+========================+======================|
|
| 3756 |
-
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3757 |
-
| 0% 40C P0 49W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3758 |
-
| | | N/A |
|
| 3759 |
-
+-----------------------------------------+------------------------+----------------------+
|
| 3760 |
-
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3761 |
-
| 0% 33C P8 23W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3762 |
-
| | | N/A |
|
| 3763 |
-
+-----------------------------------------+------------------------+----------------------+
|
| 3764 |
-
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3765 |
-
| 0% 33C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3766 |
-
| | | N/A |
|
| 3767 |
-
+-----------------------------------------+------------------------+----------------------+
|
| 3768 |
-
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3769 |
-
| 0% 34C P8 26W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3770 |
-
| | | N/A |
|
| 3771 |
-
+-----------------------------------------+------------------------+----------------------+
|
| 3772 |
-
|
| 3773 |
-
+-----------------------------------------------------------------------------------------+
|
| 3774 |
-
| Processes: |
|
| 3775 |
-
| GPU GI CI PID Type Process name GPU Memory |
|
| 3776 |
-
| ID ID Usage |
|
| 3777 |
-
|=========================================================================================|
|
| 3778 |
-
| No running processes found |
|
| 3779 |
-
+-----------------------------------------------------------------------------------------+
|
| 3780 |
-
|
| 3781 |
-
</div>
|
| 3782 |
-
</div>
|
| 3783 |
-
</div>
|
| 3784 |
-
|
| 3785 |
-
<h1>No Kernels</h1>
|
| 3786 |
<p>First, we run the model without any custom kernels to get a reference point.</p>
|
| 3787 |
<h2>Forward</h2>
|
| 3788 |
<h2>Forward and Backward</h2>
|
|
@@ -3794,7 +3727,7 @@ Cell: nv | 0.67s
|
|
| 3794 |
<span onclick="toggleOutput('forward_and_backward_no_kernel')" style="cursor: pointer;">▼ output</span>
|
| 3795 |
<span id="uv-indicator-forward_and_backward_no_kernel" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3796 |
</span> |
|
| 3797 |
-
Cell: forward_and_backward_no_kernel |
|
| 3798 |
| <button class="run-btn" onclick="runCell('forward_and_backward_no_kernel')">▶ run</button>
|
| 3799 |
<button class="copy-btn" onclick="copyCell('forward_and_backward_no_kernel')">Copy</button>
|
| 3800 |
<a href="cells/forward_and_backward_no_kernel.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3715 |
</div>
|
| 3716 |
|
| 3717 |
<div class="main-content">
|
| 3718 |
+
<h1>No Kernels</h1>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3719 |
<p>First, we run the model without any custom kernels to get a reference point.</p>
|
| 3720 |
<h2>Forward</h2>
|
| 3721 |
<h2>Forward and Backward</h2>
|
|
|
|
| 3727 |
<span onclick="toggleOutput('forward_and_backward_no_kernel')" style="cursor: pointer;">▼ output</span>
|
| 3728 |
<span id="uv-indicator-forward_and_backward_no_kernel" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3729 |
</span> |
|
| 3730 |
+
Cell: forward_and_backward_no_kernel | 17.31s | FAILED
|
| 3731 |
| <button class="run-btn" onclick="runCell('forward_and_backward_no_kernel')">▶ run</button>
|
| 3732 |
<button class="copy-btn" onclick="copyCell('forward_and_backward_no_kernel')">Copy</button>
|
| 3733 |
<a href="cells/forward_and_backward_no_kernel.py" target="_blank" class="raw-btn">Raw</a>
|
moe_benchmarks/megablocks_yamoe/artifacts/binned_run/binned_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms":
|
| 13 |
-
"min_ms": 33.
|
| 14 |
-
"max_ms":
|
| 15 |
-
"std_ms": 1.
|
| 16 |
-
"p50_ms":
|
| 17 |
-
"p95_ms":
|
| 18 |
-
"p99_ms":
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 37.042515599997614,
|
| 13 |
+
"min_ms": 33.61098199997059,
|
| 14 |
+
"max_ms": 39.77627800003347,
|
| 15 |
+
"std_ms": 1.6558189449135647,
|
| 16 |
+
"p50_ms": 37.082583499994826,
|
| 17 |
+
"p95_ms": 39.325366850013665,
|
| 18 |
+
"p99_ms": 39.73607153999694,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2699.600671829276,
|
| 21 |
+
"throughput_variance": 122.63980223025922
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms":
|
| 13 |
-
"min_ms": 40.
|
| 14 |
-
"max_ms":
|
| 15 |
-
"std_ms": 2.
|
| 16 |
-
"p50_ms":
|
| 17 |
-
"p95_ms":
|
| 18 |
-
"p99_ms":
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 47.38012357999878,
|
| 13 |
+
"min_ms": 40.92212500000869,
|
| 14 |
+
"max_ms": 51.281423999967046,
|
| 15 |
+
"std_ms": 2.9172375790717613,
|
| 16 |
+
"p50_ms": 48.13728099998116,
|
| 17 |
+
"p95_ms": 51.063823949996845,
|
| 18 |
+
"p99_ms": 51.260956209974324,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2110.589682847817,
|
| 21 |
+
"throughput_variance": 134.12269492084684
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms":
|
| 13 |
-
"min_ms":
|
| 14 |
-
"max_ms": 51.
|
| 15 |
-
"std_ms":
|
| 16 |
-
"p50_ms":
|
| 17 |
-
"p95_ms":
|
| 18 |
-
"p99_ms": 51.
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 46.92225349999944,
|
| 13 |
+
"min_ms": 38.89427600000772,
|
| 14 |
+
"max_ms": 51.62209400003803,
|
| 15 |
+
"std_ms": 3.930283839179673,
|
| 16 |
+
"p50_ms": 48.18643950000023,
|
| 17 |
+
"p95_ms": 51.4210894500053,
|
| 18 |
+
"p99_ms": 51.56389033003563,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2131.1849397855794,
|
| 21 |
+
"throughput_variance": 188.79708542409617
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/artifacts/megablocks_run/megablocks_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms": 3.
|
| 13 |
-
"min_ms": 0.
|
| 14 |
-
"max_ms": 8.
|
| 15 |
-
"std_ms": 3.
|
| 16 |
-
"p50_ms": 0.
|
| 17 |
-
"p95_ms": 8.
|
| 18 |
-
"p99_ms": 8.
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 6.4738850593566895
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 3.8653191399976095,
|
| 13 |
+
"min_ms": 0.8422269999073251,
|
| 14 |
+
"max_ms": 8.544625000013184,
|
| 15 |
+
"std_ms": 3.690530253469649,
|
| 16 |
+
"p50_ms": 0.88288749992671,
|
| 17 |
+
"p95_ms": 8.536876499982782,
|
| 18 |
+
"p99_ms": 8.542453809967583,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 25871.084994048342,
|
| 21 |
+
"throughput_variance": 50674.824252369
|
| 22 |
},
|
| 23 |
"output_sum": 6.4738850593566895
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/artifacts/visualization/moe_performance_comparison.png
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
moe_benchmarks/megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms": 4.
|
| 13 |
-
"min_ms": 4.
|
| 14 |
-
"max_ms": 4.
|
| 15 |
-
"std_ms": 0.
|
| 16 |
-
"p50_ms": 4.
|
| 17 |
-
"p95_ms": 4.
|
| 18 |
-
"p99_ms": 4.
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
-
"output_sum": 3.
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 4.248850420003691,
|
| 13 |
+
"min_ms": 4.146223000020655,
|
| 14 |
+
"max_ms": 4.269965999981196,
|
| 15 |
+
"std_ms": 0.01914249322297606,
|
| 16 |
+
"p50_ms": 4.252545499980442,
|
| 17 |
+
"p95_ms": 4.265578499993694,
|
| 18 |
+
"p99_ms": 4.269833699987657,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 23535.77794341678,
|
| 21 |
+
"throughput_variance": 107.68667127056374
|
| 22 |
},
|
| 23 |
+
"output_sum": 3.9719059467315674
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc
CHANGED
|
Binary files a/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc and b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc differ
|
|
|
moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc
CHANGED
|
Binary files a/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc and b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc differ
|
|
|
moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html
CHANGED
|
@@ -3715,74 +3715,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3715 |
</div>
|
| 3716 |
|
| 3717 |
<div class="main-content">
|
| 3718 |
-
<
|
| 3719 |
-
<div class="cell-header">
|
| 3720 |
-
<span class="collapse-indicators">
|
| 3721 |
-
<span onclick="toggleCode('nv')" style="cursor: pointer;">▼ code</span>
|
| 3722 |
-
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
-
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3724 |
-
</span> |
|
| 3725 |
-
Cell: nv | 0.71s
|
| 3726 |
-
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3727 |
-
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3728 |
-
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
| 3729 |
-
</div>
|
| 3730 |
-
<div id="code-nv" class="cell-code" data-lines="3">
|
| 3731 |
-
<div class="highlight-with-lines">
|
| 3732 |
-
<div class="line-numbers" id="lines-nv">
|
| 3733 |
-
<a class="line-number" data-cell="nv" data-line="1" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 1, true);">1</a>
|
| 3734 |
-
<a class="line-number" data-cell="nv" data-line="2" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 2, true);">2</a>
|
| 3735 |
-
<a class="line-number" data-cell="nv" data-line="3" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 3, true);">3</a>
|
| 3736 |
-
</div>
|
| 3737 |
-
<div class="code-wrap">
|
| 3738 |
-
<div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">subprocess</span>
|
| 3739 |
-
|
| 3740 |
-
<span class="nb">print</span><span class="p">(</span><span class="n">subprocess</span><span class="o">.</span><span class="n">run</span><span class="p">([</span><span class="s2">"nvidia-smi"</span><span class="p">],</span> <span class="n">capture_output</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">text</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">stdout</span><span class="p">)</span>
|
| 3741 |
-
</pre></div>
|
| 3742 |
-
|
| 3743 |
-
<div class="code-line-highlight" id="line-highlight-nv"></div>
|
| 3744 |
-
</div>
|
| 3745 |
-
</div>
|
| 3746 |
-
</div>
|
| 3747 |
-
<div id="output-nv" class="cell-output">
|
| 3748 |
-
<div class="cell-stdout">Thu Sep 25 20:02:55 2025
|
| 3749 |
-
+-----------------------------------------------------------------------------------------+
|
| 3750 |
-
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3751 |
-
|-----------------------------------------+------------------------+----------------------+
|
| 3752 |
-
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3753 |
-
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3754 |
-
| | | MIG M. |
|
| 3755 |
-
|=========================================+========================+======================|
|
| 3756 |
-
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3757 |
-
| 0% 36C P8 25W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3758 |
-
| | | N/A |
|
| 3759 |
-
+-----------------------------------------+------------------------+----------------------+
|
| 3760 |
-
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3761 |
-
| 0% 33C P8 23W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3762 |
-
| | | N/A |
|
| 3763 |
-
+-----------------------------------------+------------------------+----------------------+
|
| 3764 |
-
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3765 |
-
| 0% 33C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3766 |
-
| | | N/A |
|
| 3767 |
-
+-----------------------------------------+------------------------+----------------------+
|
| 3768 |
-
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3769 |
-
| 0% 33C P8 26W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3770 |
-
| | | N/A |
|
| 3771 |
-
+-----------------------------------------+------------------------+----------------------+
|
| 3772 |
-
|
| 3773 |
-
+-----------------------------------------------------------------------------------------+
|
| 3774 |
-
| Processes: |
|
| 3775 |
-
| GPU GI CI PID Type Process name GPU Memory |
|
| 3776 |
-
| ID ID Usage |
|
| 3777 |
-
|=========================================================================================|
|
| 3778 |
-
| No running processes found |
|
| 3779 |
-
+-----------------------------------------------------------------------------------------+
|
| 3780 |
-
|
| 3781 |
-
</div>
|
| 3782 |
-
</div>
|
| 3783 |
-
</div>
|
| 3784 |
-
|
| 3785 |
-
<h1>Comparison of Megablocks and Yamoe Kernels</h1>
|
| 3786 |
<p>This note compares the performance of the Megablocks and Yamoe kernels on the GPT-OSS-20B model.</p>
|
| 3787 |
<h2>Megablocks kernel</h2>
|
| 3788 |
<h2>Yamoe Kernel</h2>
|
|
@@ -3793,7 +3726,7 @@ Cell: nv | 0.71s
|
|
| 3793 |
<span onclick="toggleOutput('setup')" style="cursor: pointer;">▼ output</span>
|
| 3794 |
<span id="uv-indicator-setup" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3795 |
</span> |
|
| 3796 |
-
Cell: setup |
|
| 3797 |
| <button class="run-btn" onclick="runCell('setup')">▶ run</button>
|
| 3798 |
<button class="copy-btn" onclick="copyCell('setup')">Copy</button>
|
| 3799 |
<a href="cells/setup.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3715 |
</div>
|
| 3716 |
|
| 3717 |
<div class="main-content">
|
| 3718 |
+
<h1>Comparison of Megablocks and Yamoe Kernels</h1>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3719 |
<p>This note compares the performance of the Megablocks and Yamoe kernels on the GPT-OSS-20B model.</p>
|
| 3720 |
<h2>Megablocks kernel</h2>
|
| 3721 |
<h2>Yamoe Kernel</h2>
|
|
|
|
| 3726 |
<span onclick="toggleOutput('setup')" style="cursor: pointer;">▼ output</span>
|
| 3727 |
<span id="uv-indicator-setup" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3728 |
</span> |
|
| 3729 |
+
Cell: setup | 17.08s | FAILED
|
| 3730 |
| <button class="run-btn" onclick="runCell('setup')">▶ run</button>
|
| 3731 |
<button class="copy-btn" onclick="copyCell('setup')">Copy</button>
|
| 3732 |
<a href="cells/setup.py" target="_blank" class="raw-btn">Raw</a>
|
moe_benchmarks/megablocks_yamoe/torch_profile.html
CHANGED
|
@@ -3720,7 +3720,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3720 |
<span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
|
| 3721 |
<span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3722 |
</span> |
|
| 3723 |
-
Cell: utils | deps: torch, numpy |
|
| 3724 |
| <button class="run-btn" onclick="runCell('utils')">▶ run</button>
|
| 3725 |
<button class="copy-btn" onclick="copyCell('utils')">Copy</button>
|
| 3726 |
<a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3794,24 +3794,24 @@ Cell: utils | deps: torch, numpy | 36.15s
|
|
| 3794 |
<div class="uv-install-logs" id="uv-logs-utils">
|
| 3795 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3796 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
| 3797 |
Downloading networkx (1.9MiB)
|
| 3798 |
Downloading setuptools (1.1MiB)
|
| 3799 |
-
Downloading
|
| 3800 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3801 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3802 |
-
Downloading
|
| 3803 |
Downloading triton (148.3MiB)
|
|
|
|
| 3804 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3805 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3806 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3807 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3808 |
-
Downloading sympy (6.0MiB)
|
| 3809 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3810 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3811 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3812 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3813 |
Downloading torch (846.9MiB)
|
| 3814 |
-
Downloading nvidia-
|
| 3815 |
Downloading nvidia-cufile-cu12
|
| 3816 |
Downloading setuptools
|
| 3817 |
Downloading networkx
|
|
@@ -3830,7 +3830,7 @@ Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
| 3830 |
Downloading nvidia-cublas-cu12
|
| 3831 |
Downloading nvidia-cudnn-cu12
|
| 3832 |
Downloading torch
|
| 3833 |
-
Installed 26 packages in
|
| 3834 |
</div>
|
| 3835 |
</div>
|
| 3836 |
</div>
|
|
@@ -3843,7 +3843,7 @@ Installed 26 packages in 452ms
|
|
| 3843 |
<span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: bench_utils | deps: torch, numpy |
|
| 3847 |
| <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
|
| 3849 |
<a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4331,24 +4331,24 @@ Cell: bench_utils | deps: torch, numpy | 34.88s
|
|
| 4331 |
<div class="uv-install-logs" id="uv-logs-bench_utils">
|
| 4332 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4333 |
<div class="uv-logs-content" style="display: none;">
|
| 4334 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4335 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4336 |
-
Downloading setuptools (1.1MiB)
|
| 4337 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4338 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4339 |
Downloading sympy (6.0MiB)
|
|
|
|
|
|
|
|
|
|
| 4340 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4341 |
Downloading numpy (16.2MiB)
|
| 4342 |
-
Downloading nvidia-
|
| 4343 |
-
Downloading nvidia-
|
| 4344 |
Downloading torch (846.9MiB)
|
| 4345 |
-
Downloading
|
| 4346 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4347 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4348 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4349 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4350 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4351 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4352 |
Downloading nvidia-cufile-cu12
|
| 4353 |
Downloading setuptools
|
| 4354 |
Downloading networkx
|
|
@@ -4361,13 +4361,13 @@ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
| 4361 |
Downloading triton
|
| 4362 |
Downloading nvidia-cufft-cu12
|
| 4363 |
Downloading nvidia-cusolver-cu12
|
| 4364 |
-
Downloading nvidia-cusparse-cu12
|
| 4365 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 4366 |
Downloading nvidia-nccl-cu12
|
| 4367 |
Downloading nvidia-cublas-cu12
|
| 4368 |
Downloading nvidia-cudnn-cu12
|
| 4369 |
Downloading torch
|
| 4370 |
-
Installed 26 packages in
|
| 4371 |
</div>
|
| 4372 |
</div>
|
| 4373 |
</div>
|
|
@@ -4381,7 +4381,7 @@ Installed 26 packages in 453ms
|
|
| 4381 |
<span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
|
| 4382 |
<span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4383 |
</span> |
|
| 4384 |
-
Cell: config | deps: torch, numpy |
|
| 4385 |
| <button class="run-btn" onclick="runCell('config')">▶ run</button>
|
| 4386 |
<button class="copy-btn" onclick="copyCell('config')">Copy</button>
|
| 4387 |
<a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4441,24 +4441,24 @@ Cell: config | deps: torch, numpy | 37.12s
|
|
| 4441 |
<div class="uv-install-logs" id="uv-logs-config">
|
| 4442 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4443 |
<div class="uv-logs-content" style="display: none;">
|
| 4444 |
-
Downloading networkx (1.9MiB)
|
| 4445 |
-
Downloading sympy (6.0MiB)
|
| 4446 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4447 |
-
Downloading numpy (16.2MiB)
|
| 4448 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
|
|
|
|
|
|
| 4449 |
Downloading triton (148.3MiB)
|
| 4450 |
-
Downloading nvidia-
|
|
|
|
| 4451 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4452 |
-
Downloading nvidia-
|
| 4453 |
Downloading setuptools (1.1MiB)
|
| 4454 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4455 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4456 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4457 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4458 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4459 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4460 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4461 |
Downloading torch (846.9MiB)
|
|
|
|
| 4462 |
Downloading nvidia-cufile-cu12
|
| 4463 |
Downloading setuptools
|
| 4464 |
Downloading networkx
|
|
@@ -4471,13 +4471,13 @@ Downloading torch (846.9MiB)
|
|
| 4471 |
Downloading triton
|
| 4472 |
Downloading nvidia-cufft-cu12
|
| 4473 |
Downloading nvidia-cusolver-cu12
|
| 4474 |
-
Downloading nvidia-cusparselt-cu12
|
| 4475 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 4476 |
Downloading nvidia-nccl-cu12
|
| 4477 |
Downloading nvidia-cublas-cu12
|
| 4478 |
Downloading nvidia-cudnn-cu12
|
| 4479 |
Downloading torch
|
| 4480 |
-
Installed 26 packages in
|
| 4481 |
</div>
|
| 4482 |
</div>
|
| 4483 |
</div>
|
|
@@ -4490,7 +4490,7 @@ Installed 26 packages in 453ms
|
|
| 4490 |
<span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
|
| 4491 |
<span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4492 |
</span> |
|
| 4493 |
-
Cell: save_data | deps: torch, numpy | 39.
|
| 4494 |
| <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
|
| 4495 |
<button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
|
| 4496 |
<a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4585,24 +4585,24 @@ Down sum: 206.729263
|
|
| 4585 |
<div class="uv-install-logs" id="uv-logs-save_data">
|
| 4586 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4587 |
<div class="uv-logs-content" style="display: none;">
|
| 4588 |
-
Downloading networkx (1.9MiB)
|
| 4589 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4590 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4591 |
-
Downloading numpy (16.2MiB)
|
| 4592 |
-
Downloading sympy (6.0MiB)
|
| 4593 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4594 |
Downloading setuptools (1.1MiB)
|
| 4595 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4596 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4597 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4598 |
-
Downloading nvidia-
|
|
|
|
|
|
|
|
|
|
| 4599 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4600 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4601 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4602 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4603 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
| 4604 |
Downloading triton (148.3MiB)
|
|
|
|
|
|
|
|
|
|
| 4605 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
| 4606 |
Downloading nvidia-cufile-cu12
|
| 4607 |
Downloading setuptools
|
| 4608 |
Downloading networkx
|
|
@@ -4621,17 +4621,17 @@ Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
| 4621 |
Downloading nvidia-cublas-cu12
|
| 4622 |
Downloading nvidia-cudnn-cu12
|
| 4623 |
Downloading torch
|
| 4624 |
-
Installed 26 packages in
|
| 4625 |
</div>
|
| 4626 |
</div>
|
| 4627 |
<div class="cell-artifacts">
|
| 4628 |
<h4>Artifacts:</h4>
|
| 4629 |
-
<a href="artifacts/save_data/router_bias.pt" class="artifact" target="_blank">router_bias.pt</a>
|
| 4630 |
-
<a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
|
| 4631 |
-
<a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
|
| 4632 |
<a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
|
| 4633 |
-
<a href="artifacts/save_data/
|
| 4634 |
<a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
|
|
|
|
|
|
|
|
|
|
| 4635 |
</div>
|
| 4636 |
</div>
|
| 4637 |
</div>
|
|
@@ -4645,7 +4645,7 @@ Installed 26 packages in 464ms
|
|
| 4645 |
<span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
|
| 4646 |
<span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4647 |
</span> |
|
| 4648 |
-
Cell: yamoe_run | deps: torch, kernels, numpy |
|
| 4649 |
| <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
|
| 4650 |
<button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
|
| 4651 |
<a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4938,10 +4938,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 4938 |
|
| 4939 |
Warming up (10 iterations)...
|
| 4940 |
Benchmarking (50 iterations)...
|
| 4941 |
-
Progress: 20% complete (avg: 4.
|
| 4942 |
-
Progress: 40% complete (avg: 4.
|
| 4943 |
-
Progress: 60% complete (avg: 4.
|
| 4944 |
-
Progress: 80% complete (avg: 4.
|
| 4945 |
|
| 4946 |
Output tensors:
|
| 4947 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
@@ -4952,46 +4952,46 @@ Iterations: 50
|
|
| 4952 |
|
| 4953 |
Latency Statistics:
|
| 4954 |
Average: 4.249 ms
|
| 4955 |
-
Min: 4.
|
| 4956 |
-
Max: 4.
|
| 4957 |
-
Std Dev: 0.
|
| 4958 |
|
| 4959 |
Percentiles:
|
| 4960 |
-
P50 (median): 4.
|
| 4961 |
-
P95: 4.
|
| 4962 |
-
P99: 4.
|
| 4963 |
|
| 4964 |
Throughput:
|
| 4965 |
-
Tokens/sec:
|
| 4966 |
-
Std Dev:
|
| 4967 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 4968 |
|
| 4969 |
Saved benchmark results to yamoe_results.json
|
| 4970 |
|
| 4971 |
-
Output sum: 3.
|
| 4972 |
</div>
|
| 4973 |
<div class="uv-install-logs" id="uv-logs-yamoe_run">
|
| 4974 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4975 |
<div class="uv-logs-content" style="display: none;">
|
| 4976 |
-
Downloading sympy (6.0MiB)
|
| 4977 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4978 |
-
Downloading hf-xet (3.0MiB)
|
| 4979 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4980 |
-
Downloading
|
| 4981 |
-
Downloading
|
|
|
|
| 4982 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4983 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4984 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4985 |
-
Downloading triton (148.3MiB)
|
| 4986 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4987 |
-
Downloading
|
| 4988 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
| 4989 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
|
|
|
|
|
|
| 4990 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4991 |
-
Downloading
|
| 4992 |
-
Downloading
|
| 4993 |
Downloading setuptools (1.1MiB)
|
| 4994 |
-
Downloading
|
|
|
|
| 4995 |
Downloading nvidia-cufile-cu12
|
| 4996 |
Downloading hf-xet
|
| 4997 |
Downloading setuptools
|
|
@@ -5005,19 +5005,19 @@ Downloading networkx (1.9MiB)
|
|
| 5005 |
Downloading triton
|
| 5006 |
Downloading nvidia-cufft-cu12
|
| 5007 |
Downloading nvidia-cusolver-cu12
|
| 5008 |
-
Downloading nvidia-cusparse-cu12
|
| 5009 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 5010 |
Downloading nvidia-nccl-cu12
|
| 5011 |
Downloading nvidia-cublas-cu12
|
| 5012 |
Downloading nvidia-cudnn-cu12
|
| 5013 |
Downloading torch
|
| 5014 |
-
Installed 37 packages in
|
| 5015 |
</div>
|
| 5016 |
</div>
|
| 5017 |
<div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 5018 |
-
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:01,
|
| 5019 |
-
Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00,
|
| 5020 |
-
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00,
|
| 5021 |
<div class="cell-artifacts">
|
| 5022 |
<h4>Artifacts:</h4>
|
| 5023 |
<a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
|
|
@@ -5034,7 +5034,7 @@ Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 8.7
|
|
| 5034 |
<span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
|
| 5035 |
<span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5036 |
</span> |
|
| 5037 |
-
Cell: binned_run | deps: torch, numpy | 39.
|
| 5038 |
| <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
|
| 5039 |
<button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
|
| 5040 |
<a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -5448,10 +5448,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 5448 |
|
| 5449 |
Warming up (10 iterations)...
|
| 5450 |
Benchmarking (50 iterations)...
|
| 5451 |
-
Progress: 20% complete (avg:
|
| 5452 |
-
Progress: 40% complete (avg:
|
| 5453 |
-
Progress: 60% complete (avg:
|
| 5454 |
-
Progress: 80% complete (avg:
|
| 5455 |
|
| 5456 |
Output tensors:
|
| 5457 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
@@ -5461,19 +5461,19 @@ Output tensors:
|
|
| 5461 |
Iterations: 50
|
| 5462 |
|
| 5463 |
Latency Statistics:
|
| 5464 |
-
Average:
|
| 5465 |
-
Min: 33.
|
| 5466 |
-
Max:
|
| 5467 |
-
Std Dev: 1.
|
| 5468 |
|
| 5469 |
Percentiles:
|
| 5470 |
-
P50 (median):
|
| 5471 |
-
P95:
|
| 5472 |
-
P99:
|
| 5473 |
|
| 5474 |
Throughput:
|
| 5475 |
-
Tokens/sec:
|
| 5476 |
-
Std Dev:
|
| 5477 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5478 |
|
| 5479 |
Saved benchmark results to binned_results.json
|
|
@@ -5484,23 +5484,23 @@ Output sum: 3.971905
|
|
| 5484 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5485 |
<div class="uv-logs-content" style="display: none;">
|
| 5486 |
Downloading setuptools (1.1MiB)
|
| 5487 |
-
Downloading
|
| 5488 |
-
Downloading networkx (1.9MiB)
|
| 5489 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
| 5490 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
|
|
|
|
|
|
| 5491 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5492 |
-
Downloading
|
| 5493 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5494 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5495 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5496 |
-
Downloading sympy (6.0MiB)
|
| 5497 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
| 5498 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
|
|
|
| 5499 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5500 |
-
Downloading nvidia-
|
| 5501 |
-
Downloading nvidia-
|
| 5502 |
-
Downloading torch (846.9MiB)
|
| 5503 |
-
Downloading triton (148.3MiB)
|
| 5504 |
Downloading nvidia-cufile-cu12
|
| 5505 |
Downloading setuptools
|
| 5506 |
Downloading networkx
|
|
@@ -5513,13 +5513,13 @@ Downloading triton (148.3MiB)
|
|
| 5513 |
Downloading triton
|
| 5514 |
Downloading nvidia-cufft-cu12
|
| 5515 |
Downloading nvidia-cusolver-cu12
|
| 5516 |
-
Downloading nvidia-cusparselt-cu12
|
| 5517 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 5518 |
Downloading nvidia-nccl-cu12
|
| 5519 |
Downloading nvidia-cublas-cu12
|
| 5520 |
Downloading nvidia-cudnn-cu12
|
| 5521 |
Downloading torch
|
| 5522 |
-
Installed 26 packages in
|
| 5523 |
</div>
|
| 5524 |
</div>
|
| 5525 |
<div class="cell-artifacts">
|
|
@@ -5538,7 +5538,7 @@ Installed 26 packages in 444ms
|
|
| 5538 |
<span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
|
| 5539 |
<span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5540 |
</span> |
|
| 5541 |
-
Cell: gptoss_run | deps: torch, numpy |
|
| 5542 |
| <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
|
| 5543 |
<button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
|
| 5544 |
<a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -5856,10 +5856,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 5856 |
|
| 5857 |
Warming up (10 iterations)...
|
| 5858 |
Benchmarking (50 iterations)...
|
| 5859 |
-
Progress: 20% complete (avg: 50.
|
| 5860 |
-
Progress: 40% complete (avg:
|
| 5861 |
-
Progress: 60% complete (avg:
|
| 5862 |
-
Progress: 80% complete (avg:
|
| 5863 |
|
| 5864 |
Output tensors:
|
| 5865 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
@@ -5869,19 +5869,19 @@ Output tensors:
|
|
| 5869 |
Iterations: 50
|
| 5870 |
|
| 5871 |
Latency Statistics:
|
| 5872 |
-
Average:
|
| 5873 |
-
Min: 40.
|
| 5874 |
-
Max:
|
| 5875 |
-
Std Dev: 2.
|
| 5876 |
|
| 5877 |
Percentiles:
|
| 5878 |
-
P50 (median):
|
| 5879 |
-
P95:
|
| 5880 |
-
P99:
|
| 5881 |
|
| 5882 |
Throughput:
|
| 5883 |
-
Tokens/sec:
|
| 5884 |
-
Std Dev:
|
| 5885 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5886 |
|
| 5887 |
Saved benchmark results to gptoss_results.json
|
|
@@ -5891,24 +5891,24 @@ Output sum: 11.532237
|
|
| 5891 |
<div class="uv-install-logs" id="uv-logs-gptoss_run">
|
| 5892 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5893 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
| 5894 |
Downloading networkx (1.9MiB)
|
| 5895 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5896 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5897 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5898 |
Downloading triton (148.3MiB)
|
| 5899 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5900 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5901 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5902 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5903 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5904 |
-
Downloading sympy (6.0MiB)
|
| 5905 |
-
Downloading setuptools (1.1MiB)
|
| 5906 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5907 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5908 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5909 |
Downloading numpy (16.2MiB)
|
| 5910 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
| 5911 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
|
|
|
|
|
|
| 5912 |
Downloading nvidia-cufile-cu12
|
| 5913 |
Downloading setuptools
|
| 5914 |
Downloading networkx
|
|
@@ -5927,7 +5927,7 @@ Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
| 5927 |
Downloading nvidia-cublas-cu12
|
| 5928 |
Downloading nvidia-cudnn-cu12
|
| 5929 |
Downloading torch
|
| 5930 |
-
Installed 26 packages in
|
| 5931 |
</div>
|
| 5932 |
</div>
|
| 5933 |
<div class="cell-artifacts">
|
|
@@ -5946,7 +5946,7 @@ Installed 26 packages in 455ms
|
|
| 5946 |
<span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
|
| 5947 |
<span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5948 |
</span> |
|
| 5949 |
-
Cell: gptoss_training_run | deps: torch, numpy | 40.
|
| 5950 |
| <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
|
| 5951 |
<button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
|
| 5952 |
<a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -6247,10 +6247,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 6247 |
|
| 6248 |
Warming up (10 iterations)...
|
| 6249 |
Benchmarking (50 iterations)...
|
| 6250 |
-
Progress: 20% complete (avg:
|
| 6251 |
-
Progress: 40% complete (avg: 50.
|
| 6252 |
-
Progress: 60% complete (avg: 49.
|
| 6253 |
-
Progress: 80% complete (avg: 48.
|
| 6254 |
|
| 6255 |
Output tensors:
|
| 6256 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
@@ -6260,19 +6260,19 @@ Output tensors:
|
|
| 6260 |
Iterations: 50
|
| 6261 |
|
| 6262 |
Latency Statistics:
|
| 6263 |
-
Average:
|
| 6264 |
-
Min:
|
| 6265 |
-
Max: 51.
|
| 6266 |
-
Std Dev:
|
| 6267 |
|
| 6268 |
Percentiles:
|
| 6269 |
-
P50 (median):
|
| 6270 |
-
P95:
|
| 6271 |
-
P99: 51.
|
| 6272 |
|
| 6273 |
Throughput:
|
| 6274 |
-
Tokens/sec:
|
| 6275 |
-
Std Dev:
|
| 6276 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6277 |
|
| 6278 |
Saved benchmark results to gptoss_training_results.json
|
|
@@ -6282,24 +6282,24 @@ Output sum: 11.532237
|
|
| 6282 |
<div class="uv-install-logs" id="uv-logs-gptoss_training_run">
|
| 6283 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6284 |
<div class="uv-logs-content" style="display: none;">
|
| 6285 |
-
Downloading setuptools (1.1MiB)
|
| 6286 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 6287 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6288 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6289 |
-
Downloading numpy (16.2MiB)
|
| 6290 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6291 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6292 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6293 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6294 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6295 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6296 |
-
Downloading networkx (1.9MiB)
|
| 6297 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6298 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6299 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 6300 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
| 6301 |
Downloading sympy (6.0MiB)
|
|
|
|
|
|
|
|
|
|
| 6302 |
Downloading torch (846.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6303 |
Downloading nvidia-cufile-cu12
|
| 6304 |
Downloading setuptools
|
| 6305 |
Downloading networkx
|
|
@@ -6318,7 +6318,7 @@ Downloading torch (846.9MiB)
|
|
| 6318 |
Downloading nvidia-cublas-cu12
|
| 6319 |
Downloading nvidia-cudnn-cu12
|
| 6320 |
Downloading torch
|
| 6321 |
-
Installed 26 packages in
|
| 6322 |
</div>
|
| 6323 |
</div>
|
| 6324 |
<div class="cell-artifacts">
|
|
@@ -6337,7 +6337,7 @@ Installed 26 packages in 444ms
|
|
| 6337 |
<span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
|
| 6338 |
<span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6339 |
</span> |
|
| 6340 |
-
Cell: megablocks_run | deps: torch, numpy, kernels |
|
| 6341 |
| <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
|
| 6342 |
<button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
|
| 6343 |
<a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -6566,10 +6566,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 6566 |
|
| 6567 |
Warming up (10 iterations)...
|
| 6568 |
Benchmarking (50 iterations)...
|
| 6569 |
-
Progress: 20% complete (avg: 0.
|
| 6570 |
-
Progress: 40% complete (avg: 0.
|
| 6571 |
-
Progress: 60% complete (avg: 0.
|
| 6572 |
-
Progress: 80% complete (avg: 2.
|
| 6573 |
|
| 6574 |
Output tensors:
|
| 6575 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.061104, 0.055115], mean=0.000056, std=0.013535, norm=4.593927
|
|
@@ -6579,19 +6579,19 @@ Output tensors:
|
|
| 6579 |
Iterations: 50
|
| 6580 |
|
| 6581 |
Latency Statistics:
|
| 6582 |
-
Average: 3.
|
| 6583 |
-
Min: 0.
|
| 6584 |
-
Max: 8.
|
| 6585 |
-
Std Dev: 3.
|
| 6586 |
|
| 6587 |
Percentiles:
|
| 6588 |
-
P50 (median): 0.
|
| 6589 |
-
P95: 8.
|
| 6590 |
-
P99: 8.
|
| 6591 |
|
| 6592 |
Throughput:
|
| 6593 |
-
Tokens/sec:
|
| 6594 |
-
Std Dev:
|
| 6595 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6596 |
|
| 6597 |
Saved benchmark results to megablocks_results.json
|
|
@@ -6601,25 +6601,25 @@ Output sum: 6.473885
|
|
| 6601 |
<div class="uv-install-logs" id="uv-logs-megablocks_run">
|
| 6602 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6603 |
<div class="uv-logs-content" style="display: none;">
|
| 6604 |
-
Downloading
|
| 6605 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6606 |
-
Downloading setuptools (1.1MiB)
|
| 6607 |
-
Downloading hf-xet (3.0MiB)
|
| 6608 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6609 |
Downloading numpy (16.2MiB)
|
| 6610 |
-
Downloading
|
| 6611 |
-
Downloading
|
| 6612 |
-
Downloading
|
|
|
|
|
|
|
| 6613 |
Downloading torch (846.9MiB)
|
|
|
|
|
|
|
| 6614 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6615 |
-
Downloading
|
| 6616 |
-
Downloading nvidia-
|
| 6617 |
-
Downloading nvidia-
|
| 6618 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6619 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6620 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 6621 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 6622 |
Downloading triton (148.3MiB)
|
|
|
|
|
|
|
|
|
|
| 6623 |
Downloading nvidia-cufile-cu12
|
| 6624 |
Downloading hf-xet
|
| 6625 |
Downloading setuptools
|
|
@@ -6639,19 +6639,19 @@ Downloading triton (148.3MiB)
|
|
| 6639 |
Downloading nvidia-cublas-cu12
|
| 6640 |
Downloading nvidia-cudnn-cu12
|
| 6641 |
Downloading torch
|
| 6642 |
-
Installed 37 packages in
|
| 6643 |
</div>
|
| 6644 |
</div>
|
| 6645 |
<div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 6646 |
-
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:15, 4.
|
| 6647 |
-
Fetching 66 files:
|
| 6648 |
-
Fetching 66 files:
|
| 6649 |
-
Fetching 66 files:
|
| 6650 |
-
Fetching 66 files:
|
| 6651 |
-
Fetching 66 files:
|
| 6652 |
-
Fetching 66 files:
|
| 6653 |
-
Fetching 66 files:
|
| 6654 |
-
Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00,
|
| 6655 |
<div class="cell-artifacts">
|
| 6656 |
<h4>Artifacts:</h4>
|
| 6657 |
<a href="artifacts/megablocks_run/megablocks_results.json" class="artifact" target="_blank">megablocks_results.json</a>
|
|
@@ -6668,7 +6668,7 @@ Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 3
|
|
| 6668 |
<span onclick="toggleOutput('visualization')" style="cursor: pointer;">▼ output</span>
|
| 6669 |
<span id="uv-indicator-visualization" onclick="toggleUvLogsFromHeader('visualization')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6670 |
</span> |
|
| 6671 |
-
Cell: visualization | deps: matplotlib | 3.
|
| 6672 |
| <button class="run-btn" onclick="runCell('visualization')">▶ run</button>
|
| 6673 |
<button class="copy-btn" onclick="copyCell('visualization')">Copy</button>
|
| 6674 |
<a href="cells/visualization.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -6914,30 +6914,30 @@ Loaded /repo/moe_benchmarks/megablocks_yamoe/.uvnote/cache/0febdf3420999533bc2e1
|
|
| 6914 |
Performance Summary:
|
| 6915 |
Implementation Avg (ms) P95 (ms) Tokens/sec Relative Speed
|
| 6916 |
--------------------------------------------------------------------------------
|
| 6917 |
-
megablocks_results 3.
|
| 6918 |
-
yamoe_results 4.25 4.27
|
| 6919 |
-
binned_results
|
| 6920 |
-
|
| 6921 |
-
|
| 6922 |
-
|
| 6923 |
-
Fastest: megablocks_results (3.
|
| 6924 |
-
Slowest:
|
| 6925 |
-
Max Speedup: 12.
|
| 6926 |
</div>
|
| 6927 |
<div class="uv-install-logs" id="uv-logs-visualization">
|
| 6928 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6929 |
<div class="uv-logs-content" style="display: none;">
|
| 6930 |
-
Downloading pillow (6.3MiB)
|
| 6931 |
-
Downloading matplotlib (8.3MiB)
|
| 6932 |
-
Downloading fonttools (4.7MiB)
|
| 6933 |
Downloading numpy (16.2MiB)
|
| 6934 |
Downloading kiwisolver (1.4MiB)
|
|
|
|
|
|
|
|
|
|
| 6935 |
Downloading kiwisolver
|
| 6936 |
Downloading pillow
|
| 6937 |
Downloading fonttools
|
| 6938 |
Downloading matplotlib
|
| 6939 |
Downloading numpy
|
| 6940 |
-
Installed 11 packages in
|
| 6941 |
</div>
|
| 6942 |
</div>
|
| 6943 |
<div class="cell-artifacts">
|
|
|
|
| 3720 |
<span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
|
| 3721 |
<span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3722 |
</span> |
|
| 3723 |
+
Cell: utils | deps: torch, numpy | 34.86s
|
| 3724 |
| <button class="run-btn" onclick="runCell('utils')">▶ run</button>
|
| 3725 |
<button class="copy-btn" onclick="copyCell('utils')">Copy</button>
|
| 3726 |
<a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3794 |
<div class="uv-install-logs" id="uv-logs-utils">
|
| 3795 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3796 |
<div class="uv-logs-content" style="display: none;">
|
| 3797 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3798 |
+
Downloading sympy (6.0MiB)
|
| 3799 |
Downloading networkx (1.9MiB)
|
| 3800 |
Downloading setuptools (1.1MiB)
|
| 3801 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3802 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3803 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3804 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3805 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3806 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3807 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3808 |
+
Downloading numpy (16.2MiB)
|
| 3809 |
Downloading triton (148.3MiB)
|
| 3810 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3811 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
| 3812 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3813 |
Downloading torch (846.9MiB)
|
| 3814 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3815 |
Downloading nvidia-cufile-cu12
|
| 3816 |
Downloading setuptools
|
| 3817 |
Downloading networkx
|
|
|
|
| 3830 |
Downloading nvidia-cublas-cu12
|
| 3831 |
Downloading nvidia-cudnn-cu12
|
| 3832 |
Downloading torch
|
| 3833 |
+
Installed 26 packages in 447ms
|
| 3834 |
</div>
|
| 3835 |
</div>
|
| 3836 |
</div>
|
|
|
|
| 3843 |
<span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
+
Cell: bench_utils | deps: torch, numpy | 35.19s
|
| 3847 |
| <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
|
| 3849 |
<a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4331 |
<div class="uv-install-logs" id="uv-logs-bench_utils">
|
| 4332 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4333 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4334 |
Downloading sympy (6.0MiB)
|
| 4335 |
+
Downloading networkx (1.9MiB)
|
| 4336 |
+
Downloading setuptools (1.1MiB)
|
| 4337 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4338 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4339 |
Downloading numpy (16.2MiB)
|
| 4340 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4341 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4342 |
Downloading torch (846.9MiB)
|
| 4343 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4344 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
| 4345 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4346 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4347 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4348 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4349 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4350 |
+
Downloading triton (148.3MiB)
|
| 4351 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
| 4352 |
Downloading nvidia-cufile-cu12
|
| 4353 |
Downloading setuptools
|
| 4354 |
Downloading networkx
|
|
|
|
| 4361 |
Downloading triton
|
| 4362 |
Downloading nvidia-cufft-cu12
|
| 4363 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4364 |
Downloading nvidia-cusparselt-cu12
|
| 4365 |
+
Downloading nvidia-cusparse-cu12
|
| 4366 |
Downloading nvidia-nccl-cu12
|
| 4367 |
Downloading nvidia-cublas-cu12
|
| 4368 |
Downloading nvidia-cudnn-cu12
|
| 4369 |
Downloading torch
|
| 4370 |
+
Installed 26 packages in 455ms
|
| 4371 |
</div>
|
| 4372 |
</div>
|
| 4373 |
</div>
|
|
|
|
| 4381 |
<span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
|
| 4382 |
<span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4383 |
</span> |
|
| 4384 |
+
Cell: config | deps: torch, numpy | 34.96s
|
| 4385 |
| <button class="run-btn" onclick="runCell('config')">▶ run</button>
|
| 4386 |
<button class="copy-btn" onclick="copyCell('config')">Copy</button>
|
| 4387 |
<a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4441 |
<div class="uv-install-logs" id="uv-logs-config">
|
| 4442 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4443 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4444 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4445 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4446 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4447 |
+
Downloading sympy (6.0MiB)
|
| 4448 |
Downloading triton (148.3MiB)
|
| 4449 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4450 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4451 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4452 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4453 |
Downloading setuptools (1.1MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4454 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4455 |
+
Downloading networkx (1.9MiB)
|
| 4456 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4457 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4458 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4459 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4460 |
Downloading torch (846.9MiB)
|
| 4461 |
+
Downloading numpy (16.2MiB)
|
| 4462 |
Downloading nvidia-cufile-cu12
|
| 4463 |
Downloading setuptools
|
| 4464 |
Downloading networkx
|
|
|
|
| 4471 |
Downloading triton
|
| 4472 |
Downloading nvidia-cufft-cu12
|
| 4473 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4474 |
Downloading nvidia-cusparse-cu12
|
| 4475 |
+
Downloading nvidia-cusparselt-cu12
|
| 4476 |
Downloading nvidia-nccl-cu12
|
| 4477 |
Downloading nvidia-cublas-cu12
|
| 4478 |
Downloading nvidia-cudnn-cu12
|
| 4479 |
Downloading torch
|
| 4480 |
+
Installed 26 packages in 449ms
|
| 4481 |
</div>
|
| 4482 |
</div>
|
| 4483 |
</div>
|
|
|
|
| 4490 |
<span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
|
| 4491 |
<span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4492 |
</span> |
|
| 4493 |
+
Cell: save_data | deps: torch, numpy | 39.26s
|
| 4494 |
| <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
|
| 4495 |
<button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
|
| 4496 |
<a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4585 |
<div class="uv-install-logs" id="uv-logs-save_data">
|
| 4586 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4587 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4588 |
Downloading setuptools (1.1MiB)
|
|
|
|
|
|
|
| 4589 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4590 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4591 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4592 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4593 |
+
Downloading sympy (6.0MiB)
|
| 4594 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
|
|
|
| 4595 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4596 |
+
Downloading numpy (16.2MiB)
|
| 4597 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4598 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4599 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4600 |
Downloading triton (148.3MiB)
|
| 4601 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4602 |
+
Downloading networkx (1.9MiB)
|
| 4603 |
+
Downloading torch (846.9MiB)
|
| 4604 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4605 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4606 |
Downloading nvidia-cufile-cu12
|
| 4607 |
Downloading setuptools
|
| 4608 |
Downloading networkx
|
|
|
|
| 4621 |
Downloading nvidia-cublas-cu12
|
| 4622 |
Downloading nvidia-cudnn-cu12
|
| 4623 |
Downloading torch
|
| 4624 |
+
Installed 26 packages in 453ms
|
| 4625 |
</div>
|
| 4626 |
</div>
|
| 4627 |
<div class="cell-artifacts">
|
| 4628 |
<h4>Artifacts:</h4>
|
|
|
|
|
|
|
|
|
|
| 4629 |
<a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
|
| 4630 |
+
<a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
|
| 4631 |
<a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
|
| 4632 |
+
<a href="artifacts/save_data/down_proj.pt" class="artifact" target="_blank">down_proj.pt</a>
|
| 4633 |
+
<a href="artifacts/save_data/router_bias.pt" class="artifact" target="_blank">router_bias.pt</a>
|
| 4634 |
+
<a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
|
| 4635 |
</div>
|
| 4636 |
</div>
|
| 4637 |
</div>
|
|
|
|
| 4645 |
<span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
|
| 4646 |
<span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4647 |
</span> |
|
| 4648 |
+
Cell: yamoe_run | deps: torch, kernels, numpy | 39.67s
|
| 4649 |
| <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
|
| 4650 |
<button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
|
| 4651 |
<a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4938 |
|
| 4939 |
Warming up (10 iterations)...
|
| 4940 |
Benchmarking (50 iterations)...
|
| 4941 |
+
Progress: 20% complete (avg: 4.250 ms)
|
| 4942 |
+
Progress: 40% complete (avg: 4.247 ms)
|
| 4943 |
+
Progress: 60% complete (avg: 4.248 ms)
|
| 4944 |
+
Progress: 80% complete (avg: 4.248 ms)
|
| 4945 |
|
| 4946 |
Output tensors:
|
| 4947 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
|
|
| 4952 |
|
| 4953 |
Latency Statistics:
|
| 4954 |
Average: 4.249 ms
|
| 4955 |
+
Min: 4.146 ms
|
| 4956 |
+
Max: 4.270 ms
|
| 4957 |
+
Std Dev: 0.019 ms
|
| 4958 |
|
| 4959 |
Percentiles:
|
| 4960 |
+
P50 (median): 4.253 ms
|
| 4961 |
+
P95: 4.266 ms
|
| 4962 |
+
P99: 4.270 ms
|
| 4963 |
|
| 4964 |
Throughput:
|
| 4965 |
+
Tokens/sec: 23535.8
|
| 4966 |
+
Std Dev: 107.7
|
| 4967 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 4968 |
|
| 4969 |
Saved benchmark results to yamoe_results.json
|
| 4970 |
|
| 4971 |
+
Output sum: 3.971906
|
| 4972 |
</div>
|
| 4973 |
<div class="uv-install-logs" id="uv-logs-yamoe_run">
|
| 4974 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4975 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
| 4976 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4977 |
+
Downloading networkx (1.9MiB)
|
| 4978 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4979 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4980 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
|
|
|
|
|
|
| 4981 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4982 |
+
Downloading torch (846.9MiB)
|
| 4983 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4984 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4985 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4986 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4987 |
+
Downloading triton (148.3MiB)
|
| 4988 |
+
Downloading numpy (16.2MiB)
|
| 4989 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4990 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4991 |
+
Downloading sympy (6.0MiB)
|
| 4992 |
Downloading setuptools (1.1MiB)
|
| 4993 |
+
Downloading hf-xet (3.0MiB)
|
| 4994 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4995 |
Downloading nvidia-cufile-cu12
|
| 4996 |
Downloading hf-xet
|
| 4997 |
Downloading setuptools
|
|
|
|
| 5005 |
Downloading triton
|
| 5006 |
Downloading nvidia-cufft-cu12
|
| 5007 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 5008 |
Downloading nvidia-cusparselt-cu12
|
| 5009 |
+
Downloading nvidia-cusparse-cu12
|
| 5010 |
Downloading nvidia-nccl-cu12
|
| 5011 |
Downloading nvidia-cublas-cu12
|
| 5012 |
Downloading nvidia-cudnn-cu12
|
| 5013 |
Downloading torch
|
| 5014 |
+
Installed 37 packages in 453ms
|
| 5015 |
</div>
|
| 5016 |
</div>
|
| 5017 |
<div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 5018 |
+
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:01, 3.69it/s]
|
| 5019 |
+
Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 3.01it/s]
|
| 5020 |
+
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 6.13it/s]</div>
|
| 5021 |
<div class="cell-artifacts">
|
| 5022 |
<h4>Artifacts:</h4>
|
| 5023 |
<a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
|
|
|
|
| 5034 |
<span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
|
| 5035 |
<span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5036 |
</span> |
|
| 5037 |
+
Cell: binned_run | deps: torch, numpy | 39.33s
|
| 5038 |
| <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
|
| 5039 |
<button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
|
| 5040 |
<a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 5448 |
|
| 5449 |
Warming up (10 iterations)...
|
| 5450 |
Benchmarking (50 iterations)...
|
| 5451 |
+
Progress: 20% complete (avg: 38.670 ms)
|
| 5452 |
+
Progress: 40% complete (avg: 38.443 ms)
|
| 5453 |
+
Progress: 60% complete (avg: 38.101 ms)
|
| 5454 |
+
Progress: 80% complete (avg: 37.640 ms)
|
| 5455 |
|
| 5456 |
Output tensors:
|
| 5457 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
|
|
| 5461 |
Iterations: 50
|
| 5462 |
|
| 5463 |
Latency Statistics:
|
| 5464 |
+
Average: 37.043 ms
|
| 5465 |
+
Min: 33.611 ms
|
| 5466 |
+
Max: 39.776 ms
|
| 5467 |
+
Std Dev: 1.656 ms
|
| 5468 |
|
| 5469 |
Percentiles:
|
| 5470 |
+
P50 (median): 37.083 ms
|
| 5471 |
+
P95: 39.325 ms
|
| 5472 |
+
P99: 39.736 ms
|
| 5473 |
|
| 5474 |
Throughput:
|
| 5475 |
+
Tokens/sec: 2699.6
|
| 5476 |
+
Std Dev: 122.6
|
| 5477 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5478 |
|
| 5479 |
Saved benchmark results to binned_results.json
|
|
|
|
| 5484 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5485 |
<div class="uv-logs-content" style="display: none;">
|
| 5486 |
Downloading setuptools (1.1MiB)
|
| 5487 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
| 5488 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5489 |
+
Downloading networkx (1.9MiB)
|
| 5490 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5491 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5492 |
+
Downloading triton (148.3MiB)
|
| 5493 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5494 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5495 |
+
Downloading torch (846.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5496 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5497 |
+
Downloading numpy (16.2MiB)
|
| 5498 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5499 |
+
Downloading sympy (6.0MiB)
|
| 5500 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5501 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5502 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5503 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
|
|
|
| 5504 |
Downloading nvidia-cufile-cu12
|
| 5505 |
Downloading setuptools
|
| 5506 |
Downloading networkx
|
|
|
|
| 5513 |
Downloading triton
|
| 5514 |
Downloading nvidia-cufft-cu12
|
| 5515 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 5516 |
Downloading nvidia-cusparse-cu12
|
| 5517 |
+
Downloading nvidia-cusparselt-cu12
|
| 5518 |
Downloading nvidia-nccl-cu12
|
| 5519 |
Downloading nvidia-cublas-cu12
|
| 5520 |
Downloading nvidia-cudnn-cu12
|
| 5521 |
Downloading torch
|
| 5522 |
+
Installed 26 packages in 526ms
|
| 5523 |
</div>
|
| 5524 |
</div>
|
| 5525 |
<div class="cell-artifacts">
|
|
|
|
| 5538 |
<span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
|
| 5539 |
<span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5540 |
</span> |
|
| 5541 |
+
Cell: gptoss_run | deps: torch, numpy | 40.94s
|
| 5542 |
| <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
|
| 5543 |
<button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
|
| 5544 |
<a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 5856 |
|
| 5857 |
Warming up (10 iterations)...
|
| 5858 |
Benchmarking (50 iterations)...
|
| 5859 |
+
Progress: 20% complete (avg: 50.824 ms)
|
| 5860 |
+
Progress: 40% complete (avg: 50.316 ms)
|
| 5861 |
+
Progress: 60% complete (avg: 49.490 ms)
|
| 5862 |
+
Progress: 80% complete (avg: 48.422 ms)
|
| 5863 |
|
| 5864 |
Output tensors:
|
| 5865 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
|
|
| 5869 |
Iterations: 50
|
| 5870 |
|
| 5871 |
Latency Statistics:
|
| 5872 |
+
Average: 47.380 ms
|
| 5873 |
+
Min: 40.922 ms
|
| 5874 |
+
Max: 51.281 ms
|
| 5875 |
+
Std Dev: 2.917 ms
|
| 5876 |
|
| 5877 |
Percentiles:
|
| 5878 |
+
P50 (median): 48.137 ms
|
| 5879 |
+
P95: 51.064 ms
|
| 5880 |
+
P99: 51.261 ms
|
| 5881 |
|
| 5882 |
Throughput:
|
| 5883 |
+
Tokens/sec: 2110.6
|
| 5884 |
+
Std Dev: 134.1
|
| 5885 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5886 |
|
| 5887 |
Saved benchmark results to gptoss_results.json
|
|
|
|
| 5891 |
<div class="uv-install-logs" id="uv-logs-gptoss_run">
|
| 5892 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5893 |
<div class="uv-logs-content" style="display: none;">
|
| 5894 |
+
Downloading setuptools (1.1MiB)
|
| 5895 |
Downloading networkx (1.9MiB)
|
|
|
|
|
|
|
|
|
|
| 5896 |
Downloading triton (148.3MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5897 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
| 5898 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5899 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5900 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5901 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5902 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5903 |
Downloading numpy (16.2MiB)
|
| 5904 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5905 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5906 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5907 |
+
Downloading sympy (6.0MiB)
|
| 5908 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5909 |
+
Downloading torch (846.9MiB)
|
| 5910 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5911 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5912 |
Downloading nvidia-cufile-cu12
|
| 5913 |
Downloading setuptools
|
| 5914 |
Downloading networkx
|
|
|
|
| 5927 |
Downloading nvidia-cublas-cu12
|
| 5928 |
Downloading nvidia-cudnn-cu12
|
| 5929 |
Downloading torch
|
| 5930 |
+
Installed 26 packages in 449ms
|
| 5931 |
</div>
|
| 5932 |
</div>
|
| 5933 |
<div class="cell-artifacts">
|
|
|
|
| 5946 |
<span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
|
| 5947 |
<span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5948 |
</span> |
|
| 5949 |
+
Cell: gptoss_training_run | deps: torch, numpy | 40.19s
|
| 5950 |
| <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
|
| 5951 |
<button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
|
| 5952 |
<a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 6247 |
|
| 6248 |
Warming up (10 iterations)...
|
| 6249 |
Benchmarking (50 iterations)...
|
| 6250 |
+
Progress: 20% complete (avg: 51.217 ms)
|
| 6251 |
+
Progress: 40% complete (avg: 50.712 ms)
|
| 6252 |
+
Progress: 60% complete (avg: 49.798 ms)
|
| 6253 |
+
Progress: 80% complete (avg: 48.389 ms)
|
| 6254 |
|
| 6255 |
Output tensors:
|
| 6256 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
|
|
| 6260 |
Iterations: 50
|
| 6261 |
|
| 6262 |
Latency Statistics:
|
| 6263 |
+
Average: 46.922 ms
|
| 6264 |
+
Min: 38.894 ms
|
| 6265 |
+
Max: 51.622 ms
|
| 6266 |
+
Std Dev: 3.930 ms
|
| 6267 |
|
| 6268 |
Percentiles:
|
| 6269 |
+
P50 (median): 48.186 ms
|
| 6270 |
+
P95: 51.421 ms
|
| 6271 |
+
P99: 51.564 ms
|
| 6272 |
|
| 6273 |
Throughput:
|
| 6274 |
+
Tokens/sec: 2131.2
|
| 6275 |
+
Std Dev: 188.8
|
| 6276 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6277 |
|
| 6278 |
Saved benchmark results to gptoss_training_results.json
|
|
|
|
| 6282 |
<div class="uv-install-logs" id="uv-logs-gptoss_training_run">
|
| 6283 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6284 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
| 6285 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6286 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
|
|
|
| 6287 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
| 6288 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 6289 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6290 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6291 |
+
Downloading networkx (1.9MiB)
|
| 6292 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 6293 |
Downloading sympy (6.0MiB)
|
| 6294 |
+
Downloading numpy (16.2MiB)
|
| 6295 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6296 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6297 |
Downloading torch (846.9MiB)
|
| 6298 |
+
Downloading setuptools (1.1MiB)
|
| 6299 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6300 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6301 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6302 |
+
Downloading triton (148.3MiB)
|
| 6303 |
Downloading nvidia-cufile-cu12
|
| 6304 |
Downloading setuptools
|
| 6305 |
Downloading networkx
|
|
|
|
| 6318 |
Downloading nvidia-cublas-cu12
|
| 6319 |
Downloading nvidia-cudnn-cu12
|
| 6320 |
Downloading torch
|
| 6321 |
+
Installed 26 packages in 548ms
|
| 6322 |
</div>
|
| 6323 |
</div>
|
| 6324 |
<div class="cell-artifacts">
|
|
|
|
| 6337 |
<span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
|
| 6338 |
<span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6339 |
</span> |
|
| 6340 |
+
Cell: megablocks_run | deps: torch, numpy, kernels | 48.02s
|
| 6341 |
| <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
|
| 6342 |
<button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
|
| 6343 |
<a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 6566 |
|
| 6567 |
Warming up (10 iterations)...
|
| 6568 |
Benchmarking (50 iterations)...
|
| 6569 |
+
Progress: 20% complete (avg: 0.894 ms)
|
| 6570 |
+
Progress: 40% complete (avg: 0.876 ms)
|
| 6571 |
+
Progress: 60% complete (avg: 0.872 ms)
|
| 6572 |
+
Progress: 80% complete (avg: 2.709 ms)
|
| 6573 |
|
| 6574 |
Output tensors:
|
| 6575 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.061104, 0.055115], mean=0.000056, std=0.013535, norm=4.593927
|
|
|
|
| 6579 |
Iterations: 50
|
| 6580 |
|
| 6581 |
Latency Statistics:
|
| 6582 |
+
Average: 3.865 ms
|
| 6583 |
+
Min: 0.842 ms
|
| 6584 |
+
Max: 8.545 ms
|
| 6585 |
+
Std Dev: 3.691 ms
|
| 6586 |
|
| 6587 |
Percentiles:
|
| 6588 |
+
P50 (median): 0.883 ms
|
| 6589 |
+
P95: 8.537 ms
|
| 6590 |
+
P99: 8.542 ms
|
| 6591 |
|
| 6592 |
Throughput:
|
| 6593 |
+
Tokens/sec: 25871.1
|
| 6594 |
+
Std Dev: 50674.8
|
| 6595 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6596 |
|
| 6597 |
Saved benchmark results to megablocks_results.json
|
|
|
|
| 6601 |
<div class="uv-install-logs" id="uv-logs-megablocks_run">
|
| 6602 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6603 |
<div class="uv-logs-content" style="display: none;">
|
| 6604 |
+
Downloading sympy (6.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6605 |
Downloading numpy (16.2MiB)
|
| 6606 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 6607 |
+
Downloading hf-xet (3.0MiB)
|
| 6608 |
+
Downloading setuptools (1.1MiB)
|
| 6609 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6610 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6611 |
Downloading torch (846.9MiB)
|
| 6612 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6613 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6614 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6615 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6616 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6617 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
| 6618 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
| 6619 |
Downloading triton (148.3MiB)
|
| 6620 |
+
Downloading networkx (1.9MiB)
|
| 6621 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 6622 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6623 |
Downloading nvidia-cufile-cu12
|
| 6624 |
Downloading hf-xet
|
| 6625 |
Downloading setuptools
|
|
|
|
| 6639 |
Downloading nvidia-cublas-cu12
|
| 6640 |
Downloading nvidia-cudnn-cu12
|
| 6641 |
Downloading torch
|
| 6642 |
+
Installed 37 packages in 525ms
|
| 6643 |
</div>
|
| 6644 |
</div>
|
| 6645 |
<div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 6646 |
+
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:15, 4.11it/s]
|
| 6647 |
+
Fetching 66 files: 15%|█▌ | 10/66 [00:00<00:02, 21.70it/s]
|
| 6648 |
+
Fetching 66 files: 26%|██▌ | 17/66 [00:00<00:02, 19.93it/s]
|
| 6649 |
+
Fetching 66 files: 47%|████▋ | 31/66 [00:01<00:00, 39.55it/s]
|
| 6650 |
+
Fetching 66 files: 61%|██████ | 40/66 [00:01<00:00, 29.77it/s]
|
| 6651 |
+
Fetching 66 files: 77%|███████▋ | 51/66 [00:01<00:00, 38.15it/s]
|
| 6652 |
+
Fetching 66 files: 89%|████████▉ | 59/66 [00:01<00:00, 42.51it/s]
|
| 6653 |
+
Fetching 66 files: 98%|█████████▊| 65/66 [00:01<00:00, 44.76it/s]
|
| 6654 |
+
Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 35.09it/s]</div>
|
| 6655 |
<div class="cell-artifacts">
|
| 6656 |
<h4>Artifacts:</h4>
|
| 6657 |
<a href="artifacts/megablocks_run/megablocks_results.json" class="artifact" target="_blank">megablocks_results.json</a>
|
|
|
|
| 6668 |
<span onclick="toggleOutput('visualization')" style="cursor: pointer;">▼ output</span>
|
| 6669 |
<span id="uv-indicator-visualization" onclick="toggleUvLogsFromHeader('visualization')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6670 |
</span> |
|
| 6671 |
+
Cell: visualization | deps: matplotlib | 3.14s
|
| 6672 |
| <button class="run-btn" onclick="runCell('visualization')">▶ run</button>
|
| 6673 |
<button class="copy-btn" onclick="copyCell('visualization')">Copy</button>
|
| 6674 |
<a href="cells/visualization.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 6914 |
Performance Summary:
|
| 6915 |
Implementation Avg (ms) P95 (ms) Tokens/sec Relative Speed
|
| 6916 |
--------------------------------------------------------------------------------
|
| 6917 |
+
megablocks_results 3.87 8.54 25871 1.00x
|
| 6918 |
+
yamoe_results 4.25 4.27 23536 0.91x
|
| 6919 |
+
binned_results 37.04 39.33 2700 0.10x
|
| 6920 |
+
gptoss_training_results 46.92 51.42 2131 0.08x
|
| 6921 |
+
gptoss_results 47.38 51.06 2111 0.08x
|
| 6922 |
+
|
| 6923 |
+
Fastest: megablocks_results (3.87ms avg)
|
| 6924 |
+
Slowest: gptoss_results (47.38ms avg)
|
| 6925 |
+
Max Speedup: 12.3x
|
| 6926 |
</div>
|
| 6927 |
<div class="uv-install-logs" id="uv-logs-visualization">
|
| 6928 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6929 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
| 6930 |
Downloading numpy (16.2MiB)
|
| 6931 |
Downloading kiwisolver (1.4MiB)
|
| 6932 |
+
Downloading pillow (6.3MiB)
|
| 6933 |
+
Downloading fonttools (4.7MiB)
|
| 6934 |
+
Downloading matplotlib (8.3MiB)
|
| 6935 |
Downloading kiwisolver
|
| 6936 |
Downloading pillow
|
| 6937 |
Downloading fonttools
|
| 6938 |
Downloading matplotlib
|
| 6939 |
Downloading numpy
|
| 6940 |
+
Installed 11 packages in 48ms
|
| 6941 |
</div>
|
| 6942 |
</div>
|
| 6943 |
<div class="cell-artifacts">
|