Upload folder using huggingface_hub
- activation/impls/artifacts/benchmark/activation.jsonl +9 -9
- activation/impls/hf_kernels_swiglu.html +91 -92
- activation/impls/torch_swiglu.html +120 -120
- activation/results/artifacts/combine/latency.svg +2 -2
- activation/results/combined_results.html +122 -96
- causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +24 -24
- causal_conv1d/impls/cells/benchmark.py +9 -18
- causal_conv1d/impls/hf_kernels_causal_conv1d.html +0 -0
- causal_conv1d/impls/torch_causal_conv1d.html +0 -0
- causal_conv1d/results/artifacts/combine/latency.svg +2 -2
- causal_conv1d/results/combined_results.html +138 -138
- flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -6
- flash_attn/impls/cells/benchmark.py +9 -10
- flash_attn/impls/flash_attention.html +192 -140
- flash_attn/impls/hf_kernels_flash_attn.html +94 -99
- flash_attn/impls/hf_kernels_flash_attn3.html +80 -80
- flash_attn/impls/mem_efficient_attention.html +133 -185
- flash_attn/impls/sage_attention.html +17 -12
- flash_attn/impls/xformers.html +89 -89
- flash_attn/results/artifacts/combine/latency.svg +2 -2
- flash_attn/results/combined_results.html +138 -138
- index.html +0 -0
- layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +4 -4
- layer_norm/impls/hf_kernels_layer_norm.html +56 -55
- layer_norm/impls/torch_layer_norm.html +55 -54
- layer_norm/results/artifacts/combine/latency.svg +2 -2
- layer_norm/results/combined_results.html +51 -51
- rotary/impls/artifacts/benchmark/rotary.jsonl +24 -24
- rotary/impls/hf_kernels_rotary.html +0 -0
- rotary/impls/torch_rotary.html +0 -0
- rotary/index.html +0 -0
- rotary/results/artifacts/combine/latency.svg +1 -1
- rotary/results/combined_results.html +84 -84
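The commit title says these files were pushed with `huggingface_hub`. As a minimal sketch of that workflow (the local path, repo id, and `repo_type` below are assumptions for illustration, not taken from this repository):

```python
# Hypothetical sketch of the upload that creates a commit like this one,
# using the standard huggingface_hub API; folder_path, repo_id, and
# repo_type are assumptions, not read from this repo.
from huggingface_hub import upload_folder

upload_folder(
    folder_path="./kernel-bench-output",    # hypothetical local results dir
    repo_id="someuser/kernel-benchmarks",   # hypothetical repo id
    repo_type="dataset",                    # assumption about the repo type
    commit_message="Upload folder using huggingface_hub",
)
```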
activation/impls/artifacts/benchmark/activation.jsonl
CHANGED

@@ -1,9 +1,9 @@
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02251099999739381, "p50": 0.02324100000805629, "p90": 0.023539999972399528, "mean": 0.023146399996676337, "iqr": 0.0007499999696847226, "raw_times": [0.023539999972399528, 0.022790000002714805, 0.02324100000805629, 0.02365000000281725, 0.02251099999739381], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029810000000907166, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027370999987397227, "p50": 0.028240999995432503, "p90": 0.028329999963716546, "mean": 0.02825879998908931, "iqr": 0.00023899997358967084, "raw_times": [0.028090999990126875, 0.028240999995432503, 0.028329999963716546, 0.029261000008773408, 0.027370999987397227], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03212000001440174, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02641099996480989, "p50": 0.027520999992702855, "p90": 0.028440999983558868, "mean": 0.027734599996165343, "iqr": 0.001440999938040477, "raw_times": [0.02641099996480989, 0.028440999983558868, 0.029299999994236714, 0.027520999992702855, 0.02700000004551839], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.032080999972095015, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026321000007101247, "p50": 0.02724099999795726, "p90": 0.028659999998126295, "mean": 0.02923079999845868, "iqr": 0.0014990000067882647, "raw_times": [0.026321000007101247, 0.03677099999777056, 0.028659999998126295, 0.02716099999133803, 0.02724099999795726], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031121000006351096, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025979999975334067, "p50": 0.028520999990178098, "p90": 0.028720999978304462, "mean": 0.027810800008865044, "iqr": 0.00169999992749581, "raw_times": [0.025979999975334067, 0.028811000049699942, 0.028520999990178098, 0.027021000050808652, 0.028720999978304462], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02976100000751103, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026359999992564553, "p50": 0.027051000017763727, "p90": 0.027101000000584463, "mean": 0.027004599996871548, "iqr": 0.00035100003970001126, "raw_times": [0.027101000000584463, 0.027051000017763727, 0.027761000012560544, 0.026749999960884452, 0.026359999992564553], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029620999953294813, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02576100001761006, "p50": 0.027530000011211087, "p90": 0.02828099997032041, "mean": 0.0273743999969156, "iqr": 0.001340999972399004, "raw_times": [0.02576100001761006, 0.02828099997032041, 0.026939999997921404, 0.02835999998751504, 0.027530000011211087], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030121000008875853, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025459999960730784, "p50": 0.028590999988864496, "p90": 0.02870100001928222, "mean": 0.027812799999082927, "iqr": 0.00113999999484804, "raw_times": [0.025459999960730784, 0.02870100001928222, 0.028751000002102955, 0.02756100002443418, 0.028590999988864496], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029950999987704563, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02632999996876606, "p50": 0.027500999976837193, "p90": 0.028640000039104052, "mean": 0.028318399995441723, "iqr": 0.0021100000253682083, "raw_times": [0.02632999996876606, 0.03259099997876547, 0.027500999976837193, 0.026530000013735844, 0.028640000039104052], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029991000019435887, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
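Each line of the new file is a self-contained benchmark record. A short sketch of how these records can be summarized, assuming only the fields visible above (`impl`, `wl.name`, `lat_ms.p50`, `ok`):

```python
# Sketch: one JSON object per line; print implementation, workload,
# median latency (lat_ms.p50 is in milliseconds), and correctness flag.
import json

with open("activation/impls/artifacts/benchmark/activation.jsonl") as f:
    for line in f:
        rec = json.loads(line)
        print(f"{rec['impl']:<20} {rec['wl']['name']:<16} "
              f"p50={rec['lat_ms']['p50'] * 1000:.1f}us ok={rec['ok']}")
```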
activation/impls/hf_kernels_swiglu.html
CHANGED

@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
</span> |
-Cell: nv | 0.
+Cell: nv | 0.26s
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>

@@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
</div>
</div>
<div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">
+<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:26:44 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+

@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A
+| N/A 32C P0 133W / 350W | 0MiB / 46068MiB | 100% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+

@@ -3920,7 +3920,7 @@ Cell: nv | 0.21s
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
</span> |
-Cell: benchmark | 4.
+Cell: benchmark | 4.19s
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

@@ -3976,17 +3976,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul 1.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
-Activity Buffer Request
-aten::empty 2.
-cudaLaunchKernel 2.
-cudaDeviceSynchronize 0.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 72.288us 1807.20% 72.288us 72.288us 1
+hf_kernels_swiglu 12.07% 211.387us 99.59% 1.744ms 1.744ms 0.000us 0.00% 5.376us 5.376us 1
+_activation_beeaae6::silu_and_mul 1.10% 19.319us 84.87% 1.486ms 495.368us 4.000us 100.00% 5.376us 1.792us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.000us 100.00% 4.000us 1.333us 3
+Activity Buffer Request 81.49% 1.427ms 81.49% 1.427ms 1.427ms 1.376us 34.40% 1.376us 1.376us 1
+aten::empty 2.64% 46.231us 2.64% 46.231us 15.410us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 2.28% 39.911us 2.28% 39.911us 13.304us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.41% 7.220us 0.41% 7.220us 7.220us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
-Self CUDA time total: 4.
+Self CPU time total: 1.751ms
+Self CUDA time total: 4.000us

@@ -3996,17 +3996,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu 6.
-_activation_beeaae6::silu_and_mul 1.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-Activity Buffer Request 88.
-aten::empty 1.
-cudaLaunchKernel 1.
-cudaDeviceSynchronize 0.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.686us 1579.79% 62.686us 62.686us 1
+hf_kernels_swiglu 6.72% 108.943us 99.67% 1.616ms 1.616ms 0.000us 0.00% 5.312us 5.312us 1
+_activation_beeaae6::silu_and_mul 1.34% 21.721us 91.77% 1.488ms 495.875us 3.968us 100.00% 5.312us 1.771us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3
+Activity Buffer Request 88.82% 1.440ms 88.82% 1.440ms 1.440ms 1.344us 33.87% 1.344us 1.344us 1
+aten::empty 1.18% 19.150us 1.18% 19.150us 6.383us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 1.61% 26.150us 1.61% 26.150us 8.717us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.33% 5.310us 0.33% 5.310us 5.310us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
-Self CUDA time total: 3.
+Self CPU time total: 1.621ms
+Self CUDA time total: 3.968us

@@ -4016,16 +4016,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu 6.
-_activation_beeaae6::silu_and_mul 1.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.687us 1361.79% 66.687us 66.687us 1
+hf_kernels_swiglu 6.74% 109.943us 99.70% 1.626ms 1.626ms 0.000us 0.00% 6.529us 6.529us 1
+_activation_beeaae6::silu_and_mul 1.25% 20.459us 91.78% 1.496ms 498.816us 4.897us 100.00% 6.529us 2.176us 3
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.897us 100.00% 4.897us 1.632us 3
-Activity Buffer Request
-aten::empty 1.
-cudaLaunchKernel 1.
-cudaDeviceSynchronize 0.
+Activity Buffer Request 88.91% 1.450ms 88.91% 1.450ms 1.450ms 1.632us 33.33% 1.632us 1.632us 1
+aten::empty 1.18% 19.260us 1.18% 19.260us 6.420us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 1.61% 26.232us 1.61% 26.232us 8.744us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.30% 4.870us 0.30% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
+Self CPU time total: 1.631ms
Self CUDA time total: 4.897us

@@ -4036,16 +4036,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul 1.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.081us 1552.66% 66.081us 66.081us 1
+hf_kernels_swiglu 6.15% 108.423us 99.71% 1.758ms 1.758ms 0.000us 0.00% 5.696us 5.696us 1
+_activation_beeaae6::silu_and_mul 1.25% 22.001us 92.49% 1.631ms 543.697us 4.256us 100.00% 5.696us 1.899us 3
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.256us 100.00% 4.256us 1.419us 3
-Activity Buffer Request
-aten::empty 1.
-cudaLaunchKernel
-cudaDeviceSynchronize 0.
+Activity Buffer Request 80.93% 1.427ms 80.93% 1.427ms 1.427ms 1.440us 33.83% 1.440us 1.440us 1
+aten::empty 1.07% 18.910us 1.07% 18.910us 6.303us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 10.31% 181.874us 10.31% 181.874us 60.625us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.29% 5.110us 0.29% 5.110us 5.110us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
+Self CPU time total: 1.764ms
Self CUDA time total: 4.256us

@@ -4056,17 +4056,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
-Activity Buffer Request
-aten::empty
-cudaLaunchKernel
-cudaDeviceSynchronize
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.167us 1072.63% 63.167us 63.167us 1
+hf_kernels_swiglu 15.22% 87.332us 99.19% 569.294us 569.294us 0.000us 0.00% 7.873us 7.873us 1
+_activation_beeaae6::silu_and_mul 3.58% 20.570us 80.67% 463.002us 154.334us 5.889us 100.00% 7.873us 2.624us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.889us 100.00% 5.889us 1.963us 3
+Activity Buffer Request 48.76% 279.877us 48.76% 279.877us 279.877us 1.984us 33.69% 1.984us 1.984us 1
+aten::empty 3.30% 18.960us 3.30% 18.960us 6.320us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 28.32% 162.555us 28.32% 162.555us 54.185us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.81% 4.660us 0.81% 4.660us 4.660us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total: 5.
+Self CPU time total: 573.954us
+Self CUDA time total: 5.889us

@@ -4076,16 +4076,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul 1.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.632us 906.67% 69.632us 69.632us 1
+hf_kernels_swiglu 6.07% 107.484us 99.73% 1.766ms 1.766ms 0.000us 0.00% 10.240us 10.240us 1
+_activation_beeaae6::silu_and_mul 1.19% 21.010us 92.55% 1.639ms 546.413us 7.680us 100.00% 10.240us 3.413us 3
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.680us 100.00% 7.680us 2.560us 3
-Activity Buffer Request
-aten::empty 1.
-cudaLaunchKernel
-cudaDeviceSynchronize 0.
+Activity Buffer Request 81.69% 1.447ms 81.69% 1.447ms 1.447ms 2.560us 33.33% 2.560us 2.560us 1
+aten::empty 1.11% 19.720us 1.11% 19.720us 6.573us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 9.67% 171.234us 9.67% 171.234us 57.078us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.27% 4.800us 0.27% 4.800us 4.800us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
+Self CPU time total: 1.771ms
Self CUDA time total: 7.680us

@@ -4096,17 +4096,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul 1.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
-Activity Buffer Request
-aten::empty 1.
-cudaLaunchKernel
-cudaDeviceSynchronize 0.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 72.064us 1098.54% 72.064us 72.064us 1
+hf_kernels_swiglu 6.19% 109.521us 99.72% 1.763ms 1.763ms 0.000us 0.00% 8.768us 8.768us 1
+_activation_beeaae6::silu_and_mul 1.22% 21.580us 92.43% 1.635ms 544.850us 6.560us 100.00% 8.768us 2.923us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 100.00% 6.560us 2.187us 3
+Activity Buffer Request 81.92% 1.449ms 81.92% 1.449ms 1.449ms 2.208us 33.66% 2.208us 2.208us 1
+aten::empty 1.09% 19.351us 1.09% 19.351us 6.450us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 9.29% 164.205us 9.29% 164.205us 54.735us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.28% 4.990us 0.28% 4.990us 4.990us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
-Self CUDA time total: 6.
+Self CPU time total: 1.768ms
+Self CUDA time total: 6.560us

@@ -4116,17 +4116,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
-Activity Buffer Request
-aten::empty
-cudaLaunchKernel
-cudaDeviceSynchronize
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.118us 692.16% 65.118us 65.118us 1
+hf_kernels_swiglu 16.62% 89.683us 99.03% 534.374us 534.374us 0.000us 0.00% 12.576us 12.576us 1
+_activation_beeaae6::silu_and_mul 3.96% 21.372us 78.99% 426.201us 142.067us 9.408us 100.00% 12.576us 4.192us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.408us 100.00% 9.408us 3.136us 3
+Activity Buffer Request 44.61% 240.735us 44.61% 240.735us 240.735us 3.168us 33.67% 3.168us 3.168us 1
+aten::empty 3.43% 18.490us 3.43% 18.490us 6.163us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 30.41% 164.094us 30.41% 164.094us 54.698us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.97% 5.210us 0.97% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total: 9.
+Self CPU time total: 539.584us
+Self CUDA time total: 9.408us

@@ -4136,17 +4136,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us
-Activity Buffer Request
-aten::empty
-cudaLaunchKernel
-cudaDeviceSynchronize
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.182us 527.34% 69.182us 69.182us 1
+hf_kernels_swiglu 12.86% 103.214us 99.41% 797.800us 797.800us 0.000us 0.00% 17.534us 17.534us 1
+_activation_beeaae6::silu_and_mul 2.63% 21.139us 84.20% 675.726us 225.242us 13.119us 100.00% 17.534us 5.845us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.119us 100.00% 13.119us 4.373us 3
+Activity Buffer Request 61.21% 491.232us 61.21% 491.232us 491.232us 4.415us 33.65% 4.415us 4.415us 1
+aten::empty 2.35% 18.860us 2.35% 18.860us 6.287us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 20.35% 163.355us 20.35% 163.355us 54.452us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.59% 4.750us 0.59% 4.750us 4.750us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total:
+Self CPU time total: 802.550us
+Self CUDA time total: 13.119us

impl wl p50(ms) ok

@@ -4163,13 +4163,12 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
<div class="uv-install-logs" id="uv-logs-benchmark">
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
<div class="uv-logs-content" style="display: none;">
-Installed 15 packages in
+Installed 15 packages in 13ms
</div>
</div>
<div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
-Fetching 7 files:
-Fetching 7 files:
-Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 15.62it/s]</div>
+Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 14.29it/s]
+Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 19.98it/s]</div>
<div class="cell-artifacts">
<h4>Artifacts:</h4>
<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
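The PROFILE TRACE tables embedded in these reports follow the layout of `torch.profiler`'s `key_averages().table()`. A minimal sketch of how such a table is typically captured; the eager SwiGLU below is a stand-in, not the repository's benchmark harness:

```python
# Sketch: capture a profiler table in the same format as the PROFILE TRACE
# blocks above. The profiled function is a stand-in eager SwiGLU.
import torch
from torch.profiler import profile, record_function, ProfilerActivity

def swiglu_eager(x: torch.Tensor) -> torch.Tensor:
    gate, up = x.chunk(2, dim=-1)               # split the fused projection
    return torch.nn.functional.silu(gate) * up  # SiLU(gate) * up

x = torch.randn(128, 2 * 768, device="cuda", dtype=torch.bfloat16)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("torch_eager"):        # shows up as a named row
        for _ in range(3):                      # matches "# of Calls" = 3 above
            swiglu_eager(x)
    torch.cuda.synchronize()

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=12))
```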
activation/impls/torch_swiglu.html
CHANGED
|
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3871 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: nv | 0.
|
| 3875 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3877 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
|
|
| 3887 |
</div>
|
| 3888 |
</div>
|
| 3889 |
<div id="output-nv" class="cell-output">
|
| 3890 |
-
<div class="cell-stdout"><pre class="stdout-text">
|
| 3891 |
+-----------------------------------------------------------------------------------------+
|
| 3892 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3893 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
|
|
| 3896 |
| | | MIG M. |
|
| 3897 |
|=========================================+========================+======================|
|
| 3898 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3899 |
-
| N/A
|
| 3900 |
| | | N/A |
|
| 3901 |
+-----------------------------------------+------------------------+----------------------+
|
| 3902 |
|
|
@@ -3920,7 +3920,7 @@ Cell: nv | 0.21s
|
|
| 3920 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3921 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3922 |
</span> |
|
| 3923 |
-
Cell: benchmark | 6.
|
| 3924 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3925 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3926 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3970,20 +3970,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
|
|
| 3970 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3971 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3972 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3973 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3974 |
-
torch_eager 11.
|
| 3975 |
-
aten::silu 3.
|
| 3976 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 3977 |
-
aten::mul
|
| 3978 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.209us 48.
|
| 3979 |
-
Activity Buffer Request 76.
|
| 3980 |
-
aten::slice 2.
|
| 3981 |
-
aten::as_strided 0.
|
| 3982 |
-
cudaLaunchKernel 3.
|
| 3983 |
-
cudaDeviceSynchronize 0.
|
| 3984 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3985 |
-
Self CPU time total: 1.
|
| 3986 |
-
Self CUDA time total: 12.
|
| 3987 |
|
| 3988 |
|
| 3989 |
|
|
@@ -3993,20 +3993,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
|
|
| 3993 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3994 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3997 |
-
torch_eager
|
| 3998 |
-
aten::silu 2.
|
| 3999 |
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.68% 6.399us 2.133us 3
|
| 4000 |
-
aten::mul 1.
|
| 4001 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4002 |
-
Activity Buffer Request 84.
|
| 4003 |
-
aten::slice 1.
|
| 4004 |
-
aten::as_strided 0.35% 5.
|
| 4005 |
-
cudaLaunchKernel 2.
|
| 4006 |
-
cudaDeviceSynchronize 0.
|
| 4007 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4008 |
-
Self CPU time total: 1.
|
| 4009 |
-
Self CUDA time total: 12.
|
| 4010 |
|
| 4011 |
|
| 4012 |
|
|
@@ -4016,20 +4016,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
|
|
| 4016 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4017 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4018 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4019 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4020 |
-
torch_eager 6.
|
| 4021 |
-
aten::silu 2.
|
| 4022 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4023 |
-
aten::mul 1.
|
| 4024 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 48.
|
| 4025 |
-
Activity Buffer Request 84.
|
| 4026 |
-
aten::slice 1.
|
| 4027 |
-
aten::as_strided 0.
|
| 4028 |
-
cudaLaunchKernel 2.
|
| 4029 |
-
cudaDeviceSynchronize 0.
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
Self CPU time total: 1.692ms
|
| 4032 |
-
Self CUDA time total: 13.
|
| 4033 |
|
| 4034 |
|
| 4035 |
|
|
@@ -4039,20 +4039,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
|
|
| 4039 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4040 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4041 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4042 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4043 |
-
torch_eager 6.
|
| 4044 |
-
aten::silu 2.
|
| 4045 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4046 |
-
aten::mul 1.
|
| 4047 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4048 |
-
Activity Buffer Request
|
| 4049 |
-
aten::slice 1.
|
| 4050 |
-
aten::as_strided 0.
|
| 4051 |
-
cudaLaunchKernel
|
| 4052 |
-
cudaDeviceSynchronize 0.
|
| 4053 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4054 |
-
Self CPU time total: 1.
|
| 4055 |
-
Self CUDA time total: 12.
|
| 4056 |
|
| 4057 |
|
| 4058 |
|
|
@@ -4062,20 +4062,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
|
|
| 4062 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4063 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4064 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4065 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4066 |
-
torch_eager
|
| 4067 |
-
aten::silu 2.
|
| 4068 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4069 |
-
aten::mul 1.
|
| 4070 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4071 |
-
Activity Buffer Request
|
| 4072 |
-
aten::slice 1.
|
| 4073 |
-
aten::as_strided 0.
|
| 4074 |
-
cudaLaunchKernel 10.
|
| 4075 |
-
cudaDeviceSynchronize 0.
|
| 4076 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4077 |
-
Self CPU time total: 1.
|
| 4078 |
-
Self CUDA time total: 13.
|
| 4079 |
|
| 4080 |
|
| 4081 |
|
|
@@ -4085,20 +4085,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
|
|
| 4085 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4086 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4087 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4088 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4089 |
-
torch_eager
|
| 4090 |
-
aten::silu 8.
|
| 4091 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4092 |
-
aten::mul 5.
|
| 4093 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.
|
| 4094 |
-
Activity Buffer Request 22.
|
| 4095 |
-
aten::slice
|
| 4096 |
-
aten::as_strided 1.
|
| 4097 |
-
cudaLaunchKernel
|
| 4098 |
-
cudaDeviceSynchronize 0.
|
| 4099 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4100 |
-
Self CPU time total:
|
| 4101 |
-
Self CUDA time total: 15.
|
| 4102 |
|
| 4103 |
|
| 4104 |
|
|
@@ -4108,20 +4108,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
|
|
| 4108 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4109 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4110 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4111 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4112 |
-
torch_eager
|
| 4113 |
-
aten::silu 2.
|
| 4114 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4115 |
-
aten::mul 1.
|
| 4116 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4117 |
-
Activity Buffer Request 78.
|
| 4118 |
-
aten::slice 1.
|
| 4119 |
-
aten::as_strided 0.
|
| 4120 |
-
cudaLaunchKernel 9.
|
| 4121 |
-
cudaDeviceSynchronize 0.
|
| 4122 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4123 |
-
Self CPU time total: 1.
|
| 4124 |
-
Self CUDA time total: 14.
|
| 4125 |
|
| 4126 |
|
| 4127 |
|
|
@@ -4131,20 +4131,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
- torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us
- torch_eager
- aten::silu
- void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  7.936us  51.
- aten::mul  5.
- void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  7.
- Activity Buffer Request
- aten::slice  5.
- aten::as_strided  1.
- cudaLaunchKernel
- cudaDeviceSynchronize  1.
- Self CPU time total:
- Self CUDA time total: 15.
@@ -4154,20 +4154,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
- torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us
- torch_eager  5.
- aten::silu  2.
- void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  11.
- aten::mul  1.
- void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  10.
- Activity Buffer Request
- aten::slice  1.
- aten::as_strided  0.
- cudaLaunchKernel
- cudaDeviceSynchronize  0.
- Self CPU time total: 1.
- Self CUDA time total: 22.
  impl  wl  p50(ms)  ok
@@ -4184,7 +4184,7 @@ torch_eager cuda_T512_D768 0.05 True
  <div class="uv-install-logs" id="uv-logs-benchmark">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
- Installed 37 packages in
  </div>
  </div>
  <div class="cell-artifacts">
|
|
|
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
  </span> |
+ Cell: nv | 0.26s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
  </div>
  </div>
  <div id="output-nv" class="cell-output">
+ <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:26:44 2025
  +-----------------------------------------------------------------------------------------+
  | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
  |-----------------------------------------+------------------------+----------------------+
  |                                         |                        |               MIG M. |
  |=========================================+========================+======================|
  |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+ | N/A   32C    P0            133W /  350W |       0MiB /  46068MiB |    100%      Default |
  |                                         |                        |                  N/A |
  +-----------------------------------------+------------------------+----------------------+
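The device facts in the nvidia-smi dump above can also be read from inside the benchmark process; a small sketch using standard torch.cuda calls (this is not the nv cell itself):

```python
import torch

props = torch.cuda.get_device_properties(0)
print(props.name)                               # e.g. "NVIDIA L40S"
print(f"sm_{props.major}{props.minor}")         # e.g. sm_89
print(f"{props.total_memory / 2**20:.0f} MiB")  # ~46068 MiB
print(torch.version.cuda)                       # e.g. "12.8"
```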
|
|
|
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
+ Cell: benchmark | 6.86s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
+ torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  189.470us  1483.94%  189.470us  189.470us  1
+ torch_eager  11.64%  220.727us  99.60%  1.889ms  1.889ms  0.000us  0.00%  15.103us  15.103us  1
+ aten::silu  3.36%  63.732us  81.84%  1.552ms  517.326us  6.559us  51.37%  8.894us  2.965us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.559us  51.37%  6.559us  2.186us  3
+ aten::mul  1.83%  34.608us  3.05%  57.780us  19.260us  6.209us  48.63%  6.209us  2.070us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.209us  48.63%  6.209us  2.070us  3
+ Activity Buffer Request  76.17%  1.444ms  76.17%  1.444ms  1.444ms  2.335us  18.29%  2.335us  2.335us  1
+ aten::slice  2.47%  46.790us  3.07%  58.281us  9.714us  0.000us  0.00%  0.000us  0.000us  6
+ aten::as_strided  0.61%  11.491us  0.61%  11.491us  1.915us  0.000us  0.00%  0.000us  0.000us  6
+ cudaLaunchKernel  3.54%  67.043us  3.54%  67.043us  11.174us  0.000us  0.00%  0.000us  0.000us  6
+ cudaDeviceSynchronize  0.40%  7.531us  0.40%  7.531us  7.531us  0.000us  0.00%  0.000us  0.000us  1
+ Self CPU time total: 1.896ms
+ Self CUDA time total: 12.768us
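Tables like the one above are the key_averages() view of a torch.profiler trace; a minimal sketch of how such a capture is usually done (the warmup/rep counts and the record_function label here are assumptions, not the benchmark's exact code):

```python
import torch
from torch.profiler import ProfilerActivity, profile, record_function

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("torch_eager"):  # shows up as the named row above
        for _ in range(3):                # "# of Calls" == 3 in these tables
            swiglu_eager(x)               # eager SwiGLU from the sketch above
    torch.cuda.synchronize()              # accounts for the cudaDeviceSynchronize row

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
```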
|
|
|
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
+ torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  160.895us  1299.43%  160.895us  160.895us  1
+ torch_eager  6.82%  117.243us  99.71%  1.713ms  1.713ms  0.000us  0.00%  14.558us  14.558us  1
+ aten::silu  2.46%  42.340us  88.23%  1.516ms  505.362us  6.399us  51.68%  8.575us  2.858us  3
  void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.399us  51.68%  6.399us  2.133us  3
+ aten::mul  1.64%  28.101us  2.83%  48.681us  16.227us  5.983us  48.32%  5.983us  1.994us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  5.983us  48.32%  5.983us  1.994us  3
+ Activity Buffer Request  84.10%  1.445ms  84.10%  1.445ms  1.445ms  2.176us  17.57%  2.176us  2.176us  1
+ aten::slice  1.47%  25.252us  1.82%  31.222us  5.204us  0.000us  0.00%  0.000us  0.000us  6
+ aten::as_strided  0.35%  5.970us  0.35%  5.970us  0.995us  0.000us  0.00%  0.000us  0.000us  6
+ cudaLaunchKernel  2.87%  49.290us  2.87%  49.290us  8.215us  0.000us  0.00%  0.000us  0.000us  6
+ cudaDeviceSynchronize  0.29%  5.020us  0.29%  5.020us  5.020us  0.000us  0.00%  0.000us  0.000us  1
+ Self CPU time total: 1.718ms
+ Self CUDA time total: 12.382us
|
|
|
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
+ torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  157.982us  1195.38%  157.982us  157.982us  1
+ torch_eager  6.51%  110.244us  99.65%  1.686ms  1.686ms  0.000us  0.00%  15.488us  15.488us  1
+ aten::silu  2.52%  42.653us  88.50%  1.498ms  499.192us  6.784us  51.33%  9.056us  3.019us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.784us  51.33%  6.784us  2.261us  3
+ aten::mul  1.66%  28.021us  2.76%  46.791us  15.597us  6.432us  48.67%  6.432us  2.144us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.432us  48.67%  6.432us  2.144us  3
+ Activity Buffer Request  84.30%  1.427ms  84.30%  1.427ms  1.427ms  2.272us  17.19%  2.272us  2.272us  1
+ aten::slice  1.51%  25.627us  1.87%  31.700us  5.283us  0.000us  0.00%  0.000us  0.000us  6
+ aten::as_strided  0.36%  6.073us  0.36%  6.073us  1.012us  0.000us  0.00%  0.000us  0.000us  6
+ cudaLaunchKernel  2.78%  47.050us  2.78%  47.050us  7.842us  0.000us  0.00%  0.000us  0.000us  6
+ cudaDeviceSynchronize  0.35%  5.950us  0.35%  5.950us  5.950us  0.000us  0.00%  0.000us  0.000us  1
  Self CPU time total: 1.692ms
+ Self CUDA time total: 13.216us
|
|
|
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
+ torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  159.902us  1258.67%  159.902us  159.902us  1
+ torch_eager  6.73%  114.317us  99.66%  1.694ms  1.694ms  0.000us  0.00%  14.912us  14.912us  1
+ aten::silu  2.46%  41.881us  88.34%  1.501ms  500.465us  6.560us  51.64%  8.768us  2.923us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.560us  51.64%  6.560us  2.187us  3
+ aten::mul  1.68%  28.581us  2.79%  47.441us  15.814us  6.144us  48.36%  6.144us  2.048us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.144us  48.36%  6.144us  2.048us  3
+ Activity Buffer Request  74.33%  1.263ms  74.33%  1.263ms  1.263ms  2.208us  17.38%  2.208us  2.208us  1
+ aten::slice  1.44%  24.468us  1.80%  30.638us  5.106us  0.000us  0.00%  0.000us  0.000us  6
+ aten::as_strided  0.36%  6.170us  0.36%  6.170us  1.028us  0.000us  0.00%  0.000us  0.000us  6
+ cudaLaunchKernel  12.65%  214.994us  12.65%  214.994us  35.832us  0.000us  0.00%  0.000us  0.000us  6
+ cudaDeviceSynchronize  0.34%  5.830us  0.34%  5.830us  5.830us  0.000us  0.00%  0.000us  0.000us  1
+ Self CPU time total: 1.700ms
+ Self CUDA time total: 12.704us
|
|
|
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
+ torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  157.053us  1185.48%  157.053us  157.053us  1
+ torch_eager  6.08%  111.294us  99.69%  1.824ms  1.824ms  0.000us  0.00%  15.552us  15.552us  1
+ aten::silu  2.39%  43.729us  89.42%  1.636ms  545.306us  6.784us  51.21%  9.088us  3.029us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.784us  51.21%  6.784us  2.261us  3
+ aten::mul  1.44%  26.361us  2.52%  46.181us  15.394us  6.464us  48.79%  6.464us  2.155us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.464us  48.79%  6.464us  2.155us  3
+ Activity Buffer Request  77.97%  1.426ms  77.97%  1.426ms  1.426ms  2.304us  17.39%  2.304us  2.304us  1
+ aten::slice  1.34%  24.571us  1.66%  30.441us  5.074us  0.000us  0.00%  0.000us  0.000us  6
+ aten::as_strided  0.32%  5.870us  0.32%  5.870us  0.978us  0.000us  0.00%  0.000us  0.000us  6
+ cudaLaunchKernel  10.14%  185.544us  10.14%  185.544us  30.924us  0.000us  0.00%  0.000us  0.000us  6
+ cudaDeviceSynchronize  0.31%  5.601us  0.31%  5.601us  5.601us  0.000us  0.00%  0.000us  0.000us  1
+ Self CPU time total: 1.829ms
+ Self CUDA time total: 13.248us
|
|
|
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
+ torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  151.390us  977.47%  151.390us  151.390us  1
+ torch_eager  22.03%  109.975us  99.02%  494.363us  494.363us  0.000us  0.00%  18.176us  18.176us  1
+ aten::silu  8.41%  41.971us  61.88%  308.937us  102.979us  7.936us  51.24%  10.624us  3.541us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  7.936us  51.24%  7.936us  2.645us  3
+ aten::mul  5.23%  26.101us  8.92%  44.531us  14.844us  7.552us  48.76%  7.552us  2.517us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  7.552us  48.76%  7.552us  2.517us  3
+ Activity Buffer Request  22.19%  110.773us  22.19%  110.773us  110.773us  2.688us  17.36%  2.688us  2.688us  1
+ aten::slice  5.05%  25.220us  6.19%  30.920us  5.153us  0.000us  0.00%  0.000us  0.000us  6
+ aten::as_strided  1.14%  5.700us  1.14%  5.700us  0.950us  0.000us  0.00%  0.000us  0.000us  6
+ cudaLaunchKernel  34.98%  174.623us  34.98%  174.623us  29.104us  0.000us  0.00%  0.000us  0.000us  6
+ cudaDeviceSynchronize  0.98%  4.900us  0.98%  4.900us  4.900us  0.000us  0.00%  0.000us  0.000us  1
+ Self CPU time total: 499.263us
+ Self CUDA time total: 15.488us
|
|
|
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
+ torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  163.583us  1143.70%  163.583us  163.583us  1
+ torch_eager  6.28%  116.052us  99.70%  1.841ms  1.841ms  0.000us  0.00%  16.767us  16.767us  1
+ aten::silu  2.27%  41.942us  89.09%  1.645ms  548.450us  7.327us  51.23%  9.791us  3.264us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  7.327us  51.23%  7.327us  2.442us  3
+ aten::mul  1.55%  28.681us  2.62%  48.392us  16.131us  6.976us  48.77%  6.976us  2.325us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.976us  48.77%  6.976us  2.325us  3
+ Activity Buffer Request  78.22%  1.445ms  78.22%  1.445ms  1.445ms  2.464us  17.23%  2.464us  2.464us  1
+ aten::slice  1.38%  25.430us  1.70%  31.392us  5.232us  0.000us  0.00%  0.000us  0.000us  6
+ aten::as_strided  0.32%  5.962us  0.32%  5.962us  0.994us  0.000us  0.00%  0.000us  0.000us  6
+ cudaLaunchKernel  9.67%  178.614us  9.67%  178.614us  29.769us  0.000us  0.00%  0.000us  0.000us  6
+ cudaDeviceSynchronize  0.30%  5.570us  0.30%  5.570us  5.570us  0.000us  0.00%  0.000us  0.000us  1
+ Self CPU time total: 1.847ms
+ Self CUDA time total: 14.303us
|
|
|
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
+ torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  150.172us  969.60%  150.172us  150.172us  1
+ torch_eager  23.07%  110.204us  98.98%  472.752us  472.752us  0.000us  0.00%  18.176us  18.176us  1
+ aten::silu  9.08%  43.371us  60.20%  287.547us  95.849us  7.936us  51.24%  10.624us  3.541us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  7.936us  51.24%  7.936us  2.645us  3
+ aten::mul  5.48%  26.181us  9.38%  44.801us  14.934us  7.552us  48.76%  7.552us  2.517us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  7.552us  48.76%  7.552us  2.517us  3
+ Activity Buffer Request  19.26%  92.002us  19.26%  92.002us  92.002us  2.688us  17.36%  2.688us  2.688us  1
+ aten::slice  5.00%  23.870us  6.32%  30.200us  5.033us  0.000us  0.00%  0.000us  0.000us  6
+ aten::as_strided  1.33%  6.330us  1.33%  6.330us  1.055us  0.000us  0.00%  0.000us  0.000us  6
+ cudaLaunchKernel  35.76%  170.794us  35.76%  170.794us  28.466us  0.000us  0.00%  0.000us  0.000us  6
+ cudaDeviceSynchronize  1.02%  4.871us  1.02%  4.871us  4.871us  0.000us  0.00%  0.000us  0.000us  1
+ Self CPU time total: 477.623us
+ Self CUDA time total: 15.488us
|
|
|
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
+ torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  160.000us  713.30%  160.000us  160.000us  1
+ torch_eager  5.99%  109.975us  99.73%  1.831ms  1.831ms  0.000us  0.00%  26.335us  26.335us  1
+ aten::silu  2.30%  42.230us  89.52%  1.643ms  547.763us  11.583us  51.64%  15.487us  5.162us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  11.583us  51.64%  11.583us  3.861us  3
+ aten::mul  1.54%  28.250us  2.52%  46.180us  15.393us  10.848us  48.36%  10.848us  3.616us  3
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  10.848us  48.36%  10.848us  3.616us  3
+ Activity Buffer Request  78.83%  1.447ms  78.83%  1.447ms  1.447ms  3.904us  17.40%  3.904us  3.904us  1
+ aten::slice  1.37%  25.211us  1.70%  31.261us  5.210us  0.000us  0.00%  0.000us  0.000us  6
+ aten::as_strided  0.33%  6.050us  0.33%  6.050us  1.008us  0.000us  0.00%  0.000us  0.000us  6
+ cudaLaunchKernel  9.37%  171.964us  9.37%  171.964us  28.661us  0.000us  0.00%  0.000us  0.000us  6
+ cudaDeviceSynchronize  0.27%  4.930us  0.27%  4.930us  4.930us  0.000us  0.00%  0.000us  0.000us  1
+ Self CPU time total: 1.836ms
+ Self CUDA time total: 22.431us
  impl  wl  p50(ms)  ok
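The p50(ms) column in this summary is a median over a handful of timed repetitions; one common way to collect it with CUDA events, sketched under assumed warmup/rep counts:

```python
import statistics
import torch

def p50_latency_ms(fn, warmup: int = 2, reps: int = 5) -> float:
    for _ in range(warmup):                # discard cold-start iterations
        fn()
    times = []
    for _ in range(reps):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        fn()
        end.record()
        torch.cuda.synchronize()           # wait so elapsed_time() is valid
        times.append(start.elapsed_time(end))  # milliseconds
    return statistics.median(times)
```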
|
|
|
  <div class="uv-install-logs" id="uv-logs-benchmark">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
+ Installed 37 packages in 230ms
  </div>
  </div>
  <div class="cell-artifacts">
activation/results/artifacts/combine/latency.svg CHANGED (Git LFS)
activation/results/combined_results.html CHANGED
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:date>2025-10-
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
@@ -4021,83 +4021,96 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
- [old SVG axis markup: y-axis gridlines, tick marks, and tick labels of the previous latency chart]
@@ -4105,37 +4118,37 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
- [old SVG series markup: previous data-point paths for the hf_kernels_swiglu and torch_eager series]
@@ -4150,30 +4163,30 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
- [old SVG title and legend markup: title "Attention Implementation Latency", legend entries hf_kernels_swiglu and torch_eager]
|
@@ -4193,7 +4206,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
- Cell: combine | 4.
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4319,7 +4332,7 @@ Implementations included:
  <div class="uv-install-logs" id="uv-logs-combine">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
- Installed 37 packages in
  </div>
  </div>
  <div class="cell-artifacts">
|
@@ -4332,7 +4345,7 @@ Installed 37 packages in 195ms
|
|
| 4332 |
<rdf:RDF>
|
| 4333 |
<ns2:Work>
|
| 4334 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4335 |
-
<dc:date>2025-10-
|
| 4336 |
<dc:format>image/svg+xml</dc:format>
|
| 4337 |
<dc:creator>
|
| 4338 |
<ns2:Agent>
|
|
@@ -4481,83 +4494,96 @@ Installed 37 packages in 195ms
|
|
| 4481 |
<g id="matplotlib.axis_2">
|
| 4482 |
<g id="ytick_1">
|
| 4483 |
<g id="grid-y--2" class="grid grid-y">
|
| 4484 |
-
<path d="M 60.23
|
| 4485 |
</g>
|
| 4486 |
<g id="line2d_10">
|
| 4487 |
<defs>
|
| 4488 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4489 |
</defs>
|
| 4490 |
<g>
|
| 4491 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4492 |
</g>
|
| 4493 |
</g>
|
| 4494 |
<g id="text_10">
|
| 4495 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4496 |
</g>
|
| 4497 |
</g>
|
| 4498 |
<g id="ytick_2">
|
| 4499 |
<g id="grid-y--3" class="grid grid-y">
|
| 4500 |
-
<path d="M 60.23
|
| 4501 |
</g>
|
| 4502 |
<g id="line2d_11">
|
| 4503 |
<g>
|
| 4504 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4505 |
</g>
|
| 4506 |
</g>
|
| 4507 |
<g id="text_11">
|
| 4508 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4509 |
</g>
|
| 4510 |
</g>
|
| 4511 |
<g id="ytick_3">
|
| 4512 |
<g id="grid-y--4" class="grid grid-y">
|
| 4513 |
-
<path d="M 60.23
|
| 4514 |
</g>
|
| 4515 |
<g id="line2d_12">
|
| 4516 |
<g>
|
| 4517 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4518 |
</g>
|
| 4519 |
</g>
|
| 4520 |
<g id="text_12">
|
| 4521 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4522 |
</g>
|
| 4523 |
</g>
|
| 4524 |
<g id="ytick_4">
|
| 4525 |
<g id="grid-y--5" class="grid grid-y">
|
| 4526 |
-
<path d="M 60.23
|
| 4527 |
</g>
|
| 4528 |
<g id="line2d_13">
|
| 4529 |
<g>
|
| 4530 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4531 |
</g>
|
| 4532 |
</g>
|
| 4533 |
<g id="text_13">
|
| 4534 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4535 |
</g>
|
| 4536 |
</g>
|
| 4537 |
<g id="ytick_5">
|
| 4538 |
<g id="grid-y--6" class="grid grid-y">
|
| 4539 |
-
<path d="M 60.23
|
| 4540 |
</g>
|
| 4541 |
<g id="line2d_14">
|
| 4542 |
<g>
|
| 4543 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4544 |
</g>
|
| 4545 |
</g>
|
| 4546 |
<g id="text_14">
|
| 4547 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4548 |
</g>
|
| 4549 |
</g>
|
| 4550 |
<g id="ytick_6">
|
| 4551 |
<g id="grid-y--7" class="grid grid-y">
|
| 4552 |
-
<path d="M 60.23
|
| 4553 |
</g>
|
| 4554 |
<g id="line2d_15">
|
| 4555 |
<g>
|
| 4556 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4557 |
</g>
|
| 4558 |
</g>
|
| 4559 |
<g id="text_15">
|
| 4560 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4561 |
</g>
|
| 4562 |
</g>
|
| 4563 |
<g id="label--y" class="ylabel">
|
|
@@ -4565,37 +4591,37 @@ Installed 37 packages in 195ms
|
|
| 4565 |
</g>
|
| 4566 |
</g>
|
| 4567 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4568 |
-
<path d="M 96.005644 451.16779 L 185.444754
|
| 4569 |
<defs>
|
| 4570 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4571 |
</defs>
|
| 4572 |
<g clip-path="url(#p620c7d392f)">
|
| 4573 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4574 |
-
<use ns4:href="#md7efaf3aec" x="185.444754" y="
|
| 4575 |
-
<use ns4:href="#md7efaf3aec" x="274.883864" y="
|
| 4576 |
-
<use ns4:href="#md7efaf3aec" x="364.322974" y="
|
| 4577 |
-
<use ns4:href="#md7efaf3aec" x="453.762084" y="
|
| 4578 |
-
<use ns4:href="#md7efaf3aec" x="543.201194" y="
|
| 4579 |
-
<use ns4:href="#md7efaf3aec" x="632.640304" y="
|
| 4580 |
-
<use ns4:href="#md7efaf3aec" x="722.079415" y="
|
| 4581 |
-
<use ns4:href="#md7efaf3aec" x="811.518525" y="
|
| 4582 |
</g>
|
| 4583 |
</g>
|
| 4584 |
<g id="series--torch-eager" class="series">
|
| 4585 |
-
<path d="M 96.005644
|
| 4586 |
<defs>
|
| 4587 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4588 |
</defs>
|
| 4589 |
<g clip-path="url(#p620c7d392f)">
|
| 4590 |
-
<use ns4:href="#m9b8c54d372" x="96.005644" y="
|
| 4591 |
<use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4592 |
-
<use ns4:href="#m9b8c54d372" x="274.883864" y="
|
| 4593 |
-
<use ns4:href="#m9b8c54d372" x="364.322974" y="
|
| 4594 |
-
<use ns4:href="#m9b8c54d372" x="453.762084" y="
|
| 4595 |
-
<use ns4:href="#m9b8c54d372" x="543.201194" y="
|
| 4596 |
-
<use ns4:href="#m9b8c54d372" x="632.640304" y="
|
| 4597 |
-
<use ns4:href="#m9b8c54d372" x="722.079415" y="
|
| 4598 |
-
<use ns4:href="#m9b8c54d372" x="811.518525" y="
|
| 4599 |
</g>
|
| 4600 |
</g>
|
| 4601 |
<g id="patch_3">
|
|
@@ -4610,30 +4636,30 @@ Installed 37 packages in 195ms
|
|
| 4610 |
<g id="patch_6">
|
| 4611 |
<path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4612 |
</g>
|
| 4613 |
-
<g id="
|
| 4614 |
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
|
| 4615 |
</g>
|
| 4616 |
<g id="legend" class="legend">
|
| 4617 |
<g id="patch_7">
|
| 4618 |
-
<path d="M 720.811356
|
| 4619 |
</g>
|
| 4620 |
-
<g id="
|
| 4621 |
-
<path d="M 722.811356
|
| 4622 |
<g>
|
| 4623 |
-
<use ns4:href="#md7efaf3aec" x="732.811356" y="
|
| 4624 |
</g>
|
| 4625 |
</g>
|
| 4626 |
<g id="legend-label--hf-kernels-swiglu" class="legend">
|
| 4627 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="
|
| 4628 |
</g>
|
| 4629 |
-
<g id="
|
| 4630 |
-
<path d="M 722.811356
|
| 4631 |
<g>
|
| 4632 |
-
<use ns4:href="#m9b8c54d372" x="732.811356" y="
|
| 4633 |
</g>
|
| 4634 |
</g>
|
| 4635 |
<g id="legend-label--torch-eager" class="legend">
|
| 4636 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="
|
| 4637 |
</g>
|
| 4638 |
</g>
|
| 4639 |
</g>
|
|
|
|
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:date>2025-10-29T14:27:49.999657</dc:date>
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
+ [regenerated SVG axis markup: y-axis regridded with tick labels 0.025–0.055 and an added seventh tick row (ytick_7)]
+ [regenerated SVG series markup: new data-point paths for hf_kernels_swiglu and torch_eager across all nine workloads]
+ [regenerated SVG title and legend markup: title "Attention Implementation Latency", legend entries hf_kernels_swiglu and torch_eager]
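The regenerated SVG summarized above is a plain matplotlib line chart; a hedged sketch of the kind of figure the combine cell writes (the series names and title match the SVG, but the latency numbers here are placeholders):

```python
import matplotlib.pyplot as plt

workloads = [f"wl{i}" for i in range(9)]  # nine benchmark workloads
p50_ms = {                                # placeholder values, not the real data
    "hf_kernels_swiglu": [0.023, 0.028, 0.027, 0.027, 0.028, 0.027, 0.027, 0.028, 0.027],
    "torch_eager":       [0.041, 0.054, 0.053, 0.053, 0.052, 0.051, 0.053, 0.051, 0.051],
}

fig, ax = plt.subplots(figsize=(9, 5))
for impl, ys in p50_ms.items():
    ax.plot(workloads, ys, marker="o", label=impl)
ax.set_title("Attention Implementation Latency")  # title string as in the SVG
ax.set_ylabel("p50 latency (ms)")
ax.grid(axis="y", alpha=0.3)
ax.legend()
fig.savefig("latency.svg")
```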
|
|
|
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
+ Cell: combine | 4.24s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
  <div class="uv-install-logs" id="uv-logs-combine">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
+ Installed 37 packages in 218ms
  </div>
  </div>
  <div class="cell-artifacts">
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:date>2025-10-29T14:27:49.999657</dc:date>
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
+ [regenerated SVG axis markup (second chart): y-axis regridded with tick labels 0.025–0.050 and an added seventh tick row (ytick_7)]
|
| 4578 |
+
<path d="M 60.23 36.267161 L 847.294169 36.267161 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4579 |
+
</g>
|
| 4580 |
+
<g id="line2d_16">
|
| 4581 |
+
<g>
|
| 4582 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="36.267161" style="stroke: #000000; stroke-width: 0.8" />
|
| 4583 |
+
</g>
|
| 4584 |
+
</g>
|
| 4585 |
+
<g id="text_16">
|
| 4586 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="40.066379" transform="rotate(-0 53.23 40.066379)">0.055</text>
|
| 4587 |
</g>
|
| 4588 |
</g>
|
| 4589 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4591 |
</g>
|
| 4592 |
</g>
|
| 4593 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4594 |
+
<path d="M 96.005644 451.16779 L 185.444754 385.847624 L 274.883864 395.253728 L 364.322974 398.911657 L 453.762084 382.189695 L 543.201194 401.393823 L 632.640304 395.136152 L 722.079415 381.275213 L 811.518525 395.515009 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4595 |
<defs>
|
| 4596 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4597 |
</defs>
|
| 4598 |
<g clip-path="url(#p620c7d392f)">
|
| 4599 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4600 |
+
<use ns4:href="#md7efaf3aec" x="185.444754" y="385.847624" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4601 |
+
<use ns4:href="#md7efaf3aec" x="274.883864" y="395.253728" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4602 |
+
<use ns4:href="#md7efaf3aec" x="364.322974" y="398.911657" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4603 |
+
<use ns4:href="#md7efaf3aec" x="453.762084" y="382.189695" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4604 |
+
<use ns4:href="#md7efaf3aec" x="543.201194" y="401.393823" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4605 |
+
<use ns4:href="#md7efaf3aec" x="632.640304" y="395.136152" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4606 |
+
<use ns4:href="#md7efaf3aec" x="722.079415" y="381.275213" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4607 |
+
<use ns4:href="#md7efaf3aec" x="811.518525" y="395.515009" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4608 |
</g>
|
| 4609 |
</g>
|
| 4610 |
<g id="series--torch-eager" class="series">
|
| 4611 |
+
<path d="M 96.005644 194.328898 L 185.444754 47.08418 L 274.883864 59.495011 L 364.322974 61.46768 L 453.762084 66.170732 L 543.201194 84.055394 L 632.640304 56.503348 L 722.079415 80.67181 L 811.518525 81.586292 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4612 |
<defs>
|
| 4613 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4614 |
</defs>
|
| 4615 |
<g clip-path="url(#p620c7d392f)">
|
| 4616 |
+
<use ns4:href="#m9b8c54d372" x="96.005644" y="194.328898" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4617 |
<use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4618 |
+
<use ns4:href="#m9b8c54d372" x="274.883864" y="59.495011" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4619 |
+
<use ns4:href="#m9b8c54d372" x="364.322974" y="61.46768" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4620 |
+
<use ns4:href="#m9b8c54d372" x="453.762084" y="66.170732" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4621 |
+
<use ns4:href="#m9b8c54d372" x="543.201194" y="84.055394" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4622 |
+
<use ns4:href="#m9b8c54d372" x="632.640304" y="56.503348" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4623 |
+
<use ns4:href="#m9b8c54d372" x="722.079415" y="80.67181" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4624 |
+
<use ns4:href="#m9b8c54d372" x="811.518525" y="81.586292" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4625 |
</g>
|
| 4626 |
</g>
|
| 4627 |
<g id="patch_3">
|
|
|
|
| 4636 |
<g id="patch_6">
|
| 4637 |
<path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4638 |
</g>
|
| 4639 |
+
<g id="text_17">
|
| 4640 |
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
|
| 4641 |
</g>
|
| 4642 |
<g id="legend" class="legend">
|
| 4643 |
<g id="patch_7">
|
| 4644 |
+
<path d="M 720.811356 64.7925 L 840.294169 64.7925 Q 842.294169 64.7925 842.294169 62.7925 L 842.294169 33.88 Q 842.294169 31.88 840.294169 31.88 L 720.811356 31.88 Q 718.811356 31.88 718.811356 33.88 L 718.811356 62.7925 Q 718.811356 64.7925 720.811356 64.7925 L 720.811356 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4645 |
</g>
|
| 4646 |
+
<g id="line2d_17">
|
| 4647 |
+
<path d="M 722.811356 39.978438 L 732.811356 39.978438 L 742.811356 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4648 |
<g>
|
| 4649 |
+
<use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4650 |
</g>
|
| 4651 |
</g>
|
| 4652 |
<g id="legend-label--hf-kernels-swiglu" class="legend">
|
| 4653 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
|
| 4654 |
</g>
|
| 4655 |
+
<g id="line2d_18">
|
| 4656 |
+
<path d="M 722.811356 54.934687 L 732.811356 54.934687 L 742.811356 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4657 |
<g>
|
| 4658 |
+
<use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4659 |
</g>
|
| 4660 |
</g>
|
| 4661 |
<g id="legend-label--torch-eager" class="legend">
|
| 4662 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="58.434687" transform="rotate(-0 750.811356 58.434687)">torch_eager</text>
|
| 4663 |
</g>
|
| 4664 |
</g>
|
| 4665 |
</g>
|
causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
CHANGED
@@ -1,24 +1,24 @@
[- 24 previous benchmark records removed; their contents are truncated to '{"ts": "2025-10-…' in the rendered diff. The 24 replacement records follow.]
{"ts": "2025-10-29T14:27:35Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047051000024112, "p50": 0.048391000007086404, "p90": 0.048571999968771706, "mean": 0.04890720000503279, "iqr": 0.0005509999709829572, "raw_times": [0.0525010000274051, 0.048571999968771706, 0.04802099999778875, 0.048391000007086404, 0.047051000024112], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058991999992485944, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05480199996554802, "p50": 0.05610099998420992, "p90": 0.05628099995647062, "mean": 0.056069199968078465, "iqr": 0.0006299999881775875, "raw_times": [0.057510999965870724, 0.05628099995647062, 0.05610099998420992, 0.055650999968293036, 0.05480199996554802], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06025200002568454, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05469199999197372, "p50": 0.056971999981669796, "p90": 0.057361000017408514, "mean": 0.056363600003805914, "iqr": 0.0025490000439276628, "raw_times": [0.05469199999197372, 0.057361000017408514, 0.056971999981669796, 0.05798100005449669, 0.05481199997348085], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0596809999819925, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05416100003685642, "p50": 0.05502099998011545, "p90": 0.05511200004093553, "mean": 0.05489540000098714, "iqr": 0.00016000007008187822, "raw_times": [0.05416100003685642, 0.05495199997085365, 0.055230999976174644, 0.05511200004093553, 0.05502099998011545], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05659100003185813, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052401000004920206, "p50": 0.055000999964249786, "p90": 0.056301000029179704, "mean": 0.054810999995424936, "iqr": 0.0023400000372930663, "raw_times": [0.052401000004920206, 0.056301000029179704, 0.056390999986888346, 0.05396099999188664, 0.055000999964249786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05838100003074942, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05270100001553146, "p50": 0.05342100001826111, "p90": 0.054111999986616865, "mean": 0.053611199996339565, "iqr": 0.0008709999974598759, "raw_times": [0.05324099998915699, 0.054580999972131394, 0.054111999986616865, 0.05342100001826111, 0.05270100001553146], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0580610000042725, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051810999991630524, "p50": 0.05310099999178419, "p90": 0.053301000036753976, "mean": 0.05577720000928821, "iqr": 0.0007700000423938036, "raw_times": [0.051810999991630524, 0.05253099999436017, 0.06814200003191218, 0.053301000036753976, 0.05310099999178419], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056801999960498506, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.053270999956112064, "p50": 0.05397199998924407, "p90": 0.05399100001568513, "mean": 0.054061200000887766, "iqr": 0.00023000001192485797, "raw_times": [0.05399100001568513, 0.05531100003963729, 0.053270999956112064, 0.05397199998924407, 0.05376100000376027], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056641000014678866, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052231000040592335, "p50": 0.052561000018158666, "p90": 0.0526110000009794, "mean": 0.0529970000116009, "iqr": 0.0002200000039920269, "raw_times": [0.052231000040592335, 0.052390999996987375, 0.05519100000128674, 0.052561000018158666, 0.0526110000009794], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05621100001462764, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052561000018158666, "p50": 0.053772000001117704, "p90": 0.05471100001841478, "mean": 0.057879400003457704, "iqr": 0.0021000000174353772, "raw_times": [0.0526110000009794, 0.05471100001841478, 0.053772000001117704, 0.052561000018158666, 0.07574199997861797], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06976200000963217, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051341000016691396, "p50": 0.05308099997591853, "p90": 0.05349200000637211, "mean": 0.05272739998645193, "iqr": 0.0014310000437944836, "raw_times": [0.052060999962577625, 0.05366199997069998, 0.05308099997591853, 0.051341000016691396, 0.05349200000637211], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05661099999088037, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05081099999415528, "p50": 0.053202000003693684, "p90": 0.05362099994954406, "mean": 0.05282339998302632, "iqr": 0.0011499999459374521, "raw_times": [0.05081099999415528, 0.052471000003606605, 0.05362099994954406, 0.054011999964131974, 0.053202000003693684], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05642200000011144, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052221000032659504, "p50": 0.05397099999981947, "p90": 0.05448100000648992, "mean": 0.05380100001275423, "iqr": 0.001479999980347202, "raw_times": [0.052221000032659504, 0.05300100002614272, 0.055330999998659536, 0.05448100000648992, 0.05397099999981947], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058421999995061924, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05164199995988383, "p50": 0.053460999993149017, "p90": 0.05421100001967716, "mean": 0.0532791999944493, "iqr": 0.00139000002263856, "raw_times": [0.0528209999970386, 0.054261000002497894, 0.05421100001967716, 0.05164199995988383, 0.053460999993149017], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05741199998965385, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052460999995673774, "p50": 0.0557109999590466, "p90": 0.05585100001326282, "mean": 0.05600519999688913, "iqr": 0.002330000029360235, "raw_times": [0.052460999995673774, 0.0557109999590466, 0.05585100001326282, 0.053520999983902584, 0.06248200003255988], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058421000005637325, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0540510000064387, "p50": 0.0541219999945497, "p90": 0.05425200004083308, "mean": 0.054337400013082515, "iqr": 0.0001910000264615519, "raw_times": [0.0540510000064387, 0.05425200004083308, 0.05520100000921957, 0.0541219999945497, 0.05406100001437153], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05772200000819794, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05302099998516496, "p50": 0.05408099997339377, "p90": 0.0544409999747586, "mean": 0.05571119997966889, "iqr": 0.0008299999763039523, "raw_times": [0.05302099998516496, 0.0544409999747586, 0.06340199996657248, 0.053610999998454645, 0.05408099997339377], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05787100002407897, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052391999986411975, "p50": 0.05292200000894809, "p90": 0.053131000015582686, "mean": 0.05318180001268047, "iqr": 0.00026899999738816405, "raw_times": [0.053131000015582686, 0.054602000034265075, 0.052391999986411975, 0.05286200001819452, 0.05292200000894809], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05755099999760205, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05195099998900332, "p50": 0.05479100002503401, "p90": 0.05620100000669481, "mean": 0.05852919999824735, "iqr": 0.0034000000255218765, "raw_times": [0.07690199998933167, 0.05620100000669481, 0.05479100002503401, 0.05195099998900332, 0.052800999981172936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057030999982998765, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05290100000365783, "p50": 0.05457100002104198, "p90": 0.055251000048883725, "mean": 0.055353400011881604, "iqr": 0.001779000058377278, "raw_times": [0.05290100000365783, 0.06057199999531804, 0.055251000048883725, 0.05347199999050645, 0.05457100002104198], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056300999972336285, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052152000023397704, "p50": 0.05461199998535449, "p90": 0.05471200000783938, "mean": 0.05381760001910152, "iqr": 0.001900999961890193, "raw_times": [0.052152000023397704, 0.05461199998535449, 0.05480100003296684, 0.05471200000783938, 0.052811000045949186], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05758200001082514, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05339099999446262, "p50": 0.05463100001179555, "p90": 0.055171999974845676, "mean": 0.05451339999353877, "iqr": 0.0011309999763398082, "raw_times": [0.05404099999850587, 0.055331999988084135, 0.05339099999446262, 0.05463100001179555, 0.055171999974845676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058501000012256554, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05277100001421786, "p50": 0.053860999969401746, "p90": 0.054361000024982786, "mean": 0.053951200004576094, "iqr": 0.0007190000133050489, "raw_times": [0.05277100001421786, 0.05512100000260034, 0.05364200001167774, 0.053860999969401746, 0.054361000024982786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057851999997637904, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05124100005104992, "p50": 0.053691000005073874, "p90": 0.054261999991922494, "mean": 0.05327740001348502, "iqr": 0.0014609999539061391, "raw_times": [0.05124100005104992, 0.05439199998136246, 0.054261999991922494, 0.053691000005073874, 0.052801000038016355], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05714199994599767, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
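
Each record above is a self-contained JSON object: implementation name and tags, workload ("wl"), environment ("env"), latency percentiles in ms ("lat_ms"), compile time, peak memory bytes, and a correctness block checked against a float32 reference. A minimal sketch for skimming such a file, assuming it is saved locally as causal_conv1d.jsonl:

import json

# Hedged sketch: the path and one-JSON-object-per-line layout are assumed
# from the records shown above.
with open("causal_conv1d.jsonl") as f:
    records = [json.loads(line) for line in f if line.strip()]

for r in records:
    p50_us = r["lat_ms"]["p50"] * 1000  # ms -> microseconds
    print(f"{r['impl']:<26} {r['wl']['name']:<24} p50={p50_us:6.1f}us ok={r['ok']}")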
causal_conv1d/impls/cells/benchmark.py
CHANGED
@@ -4,37 +4,28 @@
Removed (previous torch reference implementation; lines truncated in the rendered diff are restored below and flagged inline):

# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import torch.nn.functional as F
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark


def torch_causal_conv1d(input_tensor, weight, bias):
    # Convert to weight dtype for computation
    x = input_tensor.to(weight.dtype)
    dim = weight.shape[0]
    width = weight.shape[1]
    seqlen = input_tensor.shape[-1]

    # The convolution itself is truncated in the rendered diff; a depthwise
    # causal conv of this shape is conventionally written as:
    out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)

    # Truncate to original sequence length
    out = out[..., :seqlen]

    # Convert back to original dtype
    return out.to(input_tensor.dtype)


run_benchmark(
    kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
    impl_name="torch_eager",  # truncated in the render; name taken from the results table below
    impl_tags={"family": "torch", "backend": "cuda"},  # truncated in the render; values inferred
    impl_func=torch_causal_conv1d,  # truncated in the render; only this function is defined
)
Added (current implementation using the Hub kernel):

# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel

# Load the causal conv1d kernel
causal_conv1d = get_kernel("kernels-community/causal-conv1d")


def hf_kernels_causal_conv1d(input_tensor, weight, bias):
    return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias)


run_benchmark(
    kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
    impl_name="hf_kernels_causal_conv1d",
    impl_tags={"family": "hf-kernels", "backend": "cuda"},
    impl_func=hf_kernels_causal_conv1d,
)
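
For orientation, a minimal sketch of calling the downloaded kernel directly, outside run_benchmark. The tensor layouts — input (batch, dim, seqlen), weight (dim, width), bias (dim,) — and the workload constants are assumptions read off the cuda_B2_D64_S128_W4 record in the JSONL above:

import torch
from kernels import get_kernel

causal_conv1d = get_kernel("kernels-community/causal-conv1d")

# Shapes assumed from the cuda_B2_D64_S128_W4 workload record.
batch, dim, seqlen, width = 2, 64, 128, 4
x = torch.randn(batch, dim, seqlen, device="cuda", dtype=torch.bfloat16)
w = torch.randn(dim, width, device="cuda", dtype=torch.bfloat16)
b = torch.randn(dim, device="cuda", dtype=torch.bfloat16)

out = causal_conv1d.causal_conv1d_fn(x, w, b)
print(out.shape)  # torch.Size([2, 64, 128]): output keeps the input shape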
causal_conv1d/impls/hf_kernels_causal_conv1d.html
CHANGED
The diff for this file is too large to render. See raw diff.

causal_conv1d/impls/torch_causal_conv1d.html
CHANGED
The diff for this file is too large to render. See raw diff.
causal_conv1d/results/artifacts/combine/latency.svg
CHANGED
Git LFS Details — the SVG is LFS-tracked, so no inline diff is shown.
causal_conv1d/results/combined_results.html
CHANGED

[Embedded SVG chart regenerated (old <dc:date> truncated to "2025-10-…"); the y-axis grid/ticks and the 24 per-workload markers of the hf_kernels_causal_conv1d (blue) and torch_eager (orange) latency series shift slightly; "Cell: combine | 4.…s" (timing truncated). Results-table rows visible as diff context (impl, workload, latency ms, ok) — rows whose values changed are truncated in the render:

hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True
hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2  0.05 True
hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4  0.05 True
hf_kernels_causal_conv1d cuda_B2_D64_S128_W2    0.05 True
hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4   0.05 True
hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2  0.05 True
hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4  0.05 True
hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2 0.05 True
hf_kernels_causal_conv1d cuda_B4_D64_S128_W2    0.05 True
hf_kernels_causal_conv1d cuda_B4_D64_S128_W4    0.05 True
hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2   0.05 True
hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4   0.05 True
hf_kernels_causal_conv1d cuda_B4_D64_S512_W4    0.05 True
torch_eager              cuda_B2_D2048_S128_W4  0.08 True
torch_eager              cuda_B2_D2048_S2048_W2 0.15 True
torch_eager              cuda_B2_D2048_S2048_W4 0.16 True
torch_eager              cuda_B2_D2048_S512_W2  0.08 True
torch_eager              cuda_B2_D2048_S512_W4  0.08 True
torch_eager              cuda_B2_D64_S128_W2    0.07 True
torch_eager              cuda_B2_D64_S128_W4    0.09 True
torch_eager              cuda_B2_D64_S2048_W4   0.08 True
torch_eager              cuda_B2_D64_S512_W2    0.09 True
torch_eager              cuda_B2_D64_S512_W4    0.08 True
torch_eager              cuda_B4_D2048_S512_W4  0.10 True
torch_eager              cuda_B4_D64_S128_W2    0.08 True
torch_eager              cuda_B4_D64_S128_W4    0.08 True
torch_eager              cuda_B4_D64_S2048_W2   0.08 True
torch_eager              cuda_B4_D64_S512_W2    0.08 True
torch_eager              cuda_B4_D64_S512_W4    0.08 True]
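
Those rows are enough for a per-workload comparison; a hedged sketch, assuming the table above is dumped as whitespace-separated text into a hypothetical results.txt:

from collections import defaultdict

p50 = defaultdict(dict)  # workload -> {impl: latency in ms}
with open("results.txt") as f:
    for line in f:
        parts = line.split()
        if len(parts) != 4:
            continue  # skip headers and rows truncated in the diff
        impl, workload, ms, ok = parts
        if ok == "True":
            p50[workload][impl] = float(ms)

for workload, by_impl in sorted(p50.items()):
    if {"hf_kernels_causal_conv1d", "torch_eager"} <= by_impl.keys():
        speedup = by_impl["torch_eager"] / by_impl["hf_kernels_causal_conv1d"]
        print(f"{workload}: {speedup:.1f}x")  # e.g. 0.08 / 0.05 = 1.6x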
@@ -4559,7 +4559,7 @@ Implementations included:
|
|
| 4559 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4560 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4561 |
<div class="uv-logs-content" style="display: none;">
|
| 4562 |
-
Installed 37 packages in
|
| 4563 |
</div>
|
| 4564 |
</div>
|
| 4565 |
<div class="cell-artifacts">
|
|
@@ -4572,7 +4572,7 @@ Installed 37 packages in 221ms
|
|
| 4572 |
<rdf:RDF>
|
| 4573 |
<ns2:Work>
|
| 4574 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4575 |
-
<dc:date>2025-10-
|
| 4576 |
<dc:format>image/svg+xml</dc:format>
|
| 4577 |
<dc:creator>
|
| 4578 |
<ns2:Agent>
|
|
@@ -4916,70 +4916,70 @@ Installed 37 packages in 221ms
|
|
| 4916 |
<g id="matplotlib.axis_2">
|
| 4917 |
<g id="ytick_1">
|
| 4918 |
<g id="grid-y--2" class="grid grid-y">
|
| 4919 |
-
<path d="M 47.72
|
| 4920 |
</g>
|
| 4921 |
<g id="line2d_25">
|
| 4922 |
<defs>
|
| 4923 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4924 |
</defs>
|
| 4925 |
<g>
|
| 4926 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4927 |
</g>
|
| 4928 |
</g>
|
| 4929 |
<g id="text_25">
|
| 4930 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4931 |
</g>
|
| 4932 |
</g>
|
| 4933 |
<g id="ytick_2">
|
| 4934 |
<g id="grid-y--3" class="grid grid-y">
|
| 4935 |
-
<path d="M 47.72
|
| 4936 |
</g>
|
| 4937 |
<g id="line2d_26">
|
| 4938 |
<g>
|
| 4939 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4940 |
</g>
|
| 4941 |
</g>
|
| 4942 |
<g id="text_26">
|
| 4943 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4944 |
</g>
|
| 4945 |
</g>
|
| 4946 |
<g id="ytick_3">
|
| 4947 |
<g id="grid-y--4" class="grid grid-y">
|
| 4948 |
-
<path d="M 47.72
|
| 4949 |
</g>
|
| 4950 |
<g id="line2d_27">
|
| 4951 |
<g>
|
| 4952 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4953 |
</g>
|
| 4954 |
</g>
|
| 4955 |
<g id="text_27">
|
| 4956 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.
|
| 4957 |
</g>
|
| 4958 |
</g>
|
| 4959 |
<g id="ytick_4">
|
| 4960 |
<g id="grid-y--5" class="grid grid-y">
|
| 4961 |
-
<path d="M 47.72 126.
|
| 4962 |
</g>
|
| 4963 |
<g id="line2d_28">
|
| 4964 |
<g>
|
| 4965 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="126.
|
| 4966 |
</g>
|
| 4967 |
</g>
|
| 4968 |
<g id="text_28">
|
| 4969 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.
|
| 4970 |
</g>
|
| 4971 |
</g>
|
| 4972 |
<g id="ytick_5">
|
| 4973 |
<g id="grid-y--6" class="grid grid-y">
|
| 4974 |
-
<path d="M 47.72
|
- (old tick-mark and label markup for line2d_29/text_29 and the y-axis label group; attribute values truncated in the diff view)

@@ -4987,66 +4987,66 @@ Installed 37 packages in 221ms

- (old line paths and per-point marker positions for the hf-kernels-causal-conv1d and torch-eager series; coordinate values truncated in the diff view)
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:date>2025-10-29T14:27:58.771179</dc:date>
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
+ (updated y-axis grid lines and tick marks at y = 377.079386, 293.552318, 210.02525, 126.498182, 42.971114, labeled 0.1 through 0.5)
  <g id="label--y" class="ylabel">
  </g>
  </g>
  <g id="series--hf-kernels-causal-conv1d" class="series">
+ <path d="M 83.325193 420.186871 L 114.286231 413.746934 L 145.247268 413.019413 L 176.208306 414.649026 L 207.169343 414.665731 L 238.130381 415.985459 L 269.091418 416.252746 L 300.052455 415.525225 L 331.013493 416.703792 L 361.97453 415.692279 L 392.935568 416.269451 L 423.896605 416.168383 L 454.857643 415.52606 L 485.81868 415.952048 L 516.779718 414.072689 L 547.740755 415.399934 L 578.701793 415.43418 L 609.66283 416.402259 L 640.623868 414.841138 L 671.584905 415.024898 L 702.545943 414.990652 L 733.50698 414.974782 L 764.468018 415.61794 L 795.429055 415.759936 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
  (marker definition and the 24 per-point circle <use> elements at the same coordinates omitted)
  </g>
  <g id="series--torch-eager" class="series">
+ <path d="M 83.325193 401.710683 L 114.286231 389.180788 L 145.247268 389.523249 L 176.208306 390.141349 L 207.169343 391.126968 L 238.130381 390.809566 L 269.091418 390.934856 L 300.052455 390.667569 L 331.013493 390.500515 L 361.97453 389.707008 L 392.935568 339.037818 L 423.896605 325.239147 L 454.857643 391.043441 L 485.81868 391.009195 L 516.779718 391.143674 L 547.740755 390.442046 L 578.701793 390.951562 L 609.66283 389.129836 L 640.623868 391.795185 L 671.584905 391.319081 L 702.545943 381.654999 L 733.50698 375.966806 L 764.468018 53.96077 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
  (marker definition and the 24 per-point circle <use> elements at the same coordinates omitted)
  </g>
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
+ Cell: combine | 4.32s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
  hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True
  hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True
  hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.05 True
+ hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.06 True
+ hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.06 True
  hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True
+ hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.06 True
+ hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.06 True
  hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2 0.05 True
  hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4 0.05 True
  hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2 0.05 True
  hf_kernels_causal_conv1d cuda_B4_D64_S128_W4 0.05 True
  hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2 0.05 True
  hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4 0.05 True
+ hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.06 True
  hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
+ torch_eager cuda_B2_D2048_S128_W2 0.08 True
  torch_eager cuda_B2_D2048_S128_W4 0.08 True
  torch_eager cuda_B2_D2048_S2048_W2 0.15 True
  torch_eager cuda_B2_D2048_S2048_W4 0.16 True

  torch_eager cuda_B2_D2048_S512_W4 0.08 True
  torch_eager cuda_B2_D64_S128_W2 0.07 True
  torch_eager cuda_B2_D64_S128_W4 0.09 True
+ torch_eager cuda_B2_D64_S2048_W2 0.08 True
  torch_eager cuda_B2_D64_S2048_W4 0.08 True
  torch_eager cuda_B2_D64_S512_W2 0.09 True
  torch_eager cuda_B2_D64_S512_W4 0.08 True

  torch_eager cuda_B4_D64_S128_W2 0.08 True
  torch_eager cuda_B4_D64_S128_W4 0.08 True
  torch_eager cuda_B4_D64_S2048_W2 0.08 True
+ torch_eager cuda_B4_D64_S2048_W4 0.09 True
  torch_eager cuda_B4_D64_S512_W2 0.08 True
  torch_eager cuda_B4_D64_S512_W4 0.08 True
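For context on the torch_eager rows, a hypothetical sketch of what a depthwise causal conv1d looks like in eager PyTorch; the B/D/S/W pieces of the workload names map to batch, channels, sequence length, and kernel width. This is an illustration, not the repo's benchmark cell:

# Hypothetical eager reference: depthwise causal conv1d via left-only padding.
import torch
import torch.nn.functional as F

def causal_conv1d_eager(x, weight):
    # x: [B, D, S]; weight: [D, W] used as a depthwise kernel [D, 1, W].
    W = weight.shape[-1]
    x = F.pad(x, (W - 1, 0))  # pad on the left so no future positions leak in
    return F.conv1d(x, weight.unsqueeze(1), groups=x.shape[1])

x = torch.randn(2, 64, 512)   # cf. the cuda_B2_D64_S512_* workloads
w = torch.randn(64, 4)        # width-4 kernel, as in the *_W4 variants
print(causal_conv1d_eager(x, w).shape)  # torch.Size([2, 64, 512])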
  <div class="uv-install-logs" id="uv-logs-combine">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
+ Installed 37 packages in 214ms
  </div>
  </div>
  <div class="cell-artifacts">
(the dc:date, y-axis tick, and series-path updates above repeat verbatim for the second embedded copy of the latency plot)
flash_attn/impls/artifacts/benchmark/attention.jsonl
CHANGED
|
@@ -1,6 +1,6 @@
- (the six benchmark records from the previous run; JSON bodies truncated in the diff view)
+ {"ts": "2025-10-29T14:27:40Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.98791400000664, "p50": 0.995113999977093, "p90": 1.0003840000081254, "mean": 0.9967803999984426, "iqr": 0.00634899998885885, "raw_times": [0.98791400000664, 0.9940350000192666, 1.006454999981088, 1.0003840000081254, 0.995113999977093], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0150049999992916, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0233649999804584, "p50": 1.0321449999537435, "p90": 1.0348449999924014, "mean": 1.032277399974646, "iqr": 0.009739000006447895, "raw_times": [1.045925999960673, 1.0251059999859535, 1.0321449999537435, 1.0233649999804584, 1.0348449999924014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.035865999995167, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0737370000128976, "p50": 1.084086999981082, "p90": 1.088675999994848, "mean": 1.0826705999988917, "iqr": 0.0148400000057336, "raw_times": [1.0738359999891145, 1.0930170000165162, 1.084086999981082, 1.0737370000128976, 1.088675999994848], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0889670000437945, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0887770000067576, "p50": 1.0916359999555425, "p90": 1.096396999969329, "mean": 1.0932085999797891, "iqr": 0.005600999998023326, "raw_times": [1.0887770000067576, 1.0907959999713057, 1.0916359999555425, 1.0984369999960109, 1.096396999969329], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1031370000296192, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2465009999687027, "p50": 1.2523310000460697, "p90": 1.2523909999799798, "mean": 1.2538410000047406, "iqr": 0.005290999979479238, "raw_times": [1.2523310000460697, 1.2523909999799798, 1.2471000000005006, 1.2465009999687027, 1.2708820000284504], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2551809999763464, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2355700000057368, "p50": 1.241141000036805, "p90": 1.2576200000466997, "mean": 1.2477664000130062, "iqr": 0.02047000003813082, "raw_times": [1.2355700000057368, 1.241141000036805, 1.237150000008569, 1.2576200000466997, 1.2673509999672206], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2579809999806457, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
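For reference, a minimal sketch of how the records above can be summarized (an illustration, assuming the artifact sits at the path listed in this commit; the field names match the records shown):

# Hypothetical one-off summary of the attention.jsonl benchmark records.
import json

with open("flash_attn/impls/artifacts/benchmark/attention.jsonl") as f:
    for line in f:
        rec = json.loads(line)
        # p50 latency in milliseconds plus the correctness flag, per workload
        print(f"{rec['impl']:14s} {rec['wl']['name']:24s} "
              f"p50={rec['lat_ms']['p50']:.3f}ms ok={rec['ok']}")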
flash_attn/impls/cells/benchmark.py
CHANGED
@@ -4,7 +4,7 @@
  # "numpy",
  # "torch==2.8.0",
  # "kernels-benchmark-tools",
- # "
+ # "xformers",
  # ]
  #
  # [tool.uv.sources]

@@ -13,19 +13,18 @@
  import torch
  import sys
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
-
- # Load the flash attention 3 kernel
- hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
-
-
-
+ import xformers.ops as xops
+
+ def xformers_attention(q, k, v):
+     """xFormers memory efficient attention"""
+     # xFormers expects [batch, seq_len, heads, head_dim]
+     return xops.memory_efficient_attention(q, k, v)

  run_benchmark(
      kernel_type=KernelTypeEnum.ATTENTION,
-     impl_name="
-     impl_tags={"family": "
-     impl_func=
+     impl_name="xformers_meff",
+     impl_tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
+     impl_func=xformers_attention,
  )
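For a standalone check of the call the new cell benchmarks, a minimal sketch (an illustration, not part of the commit; it assumes a CUDA machine with xformers installed, and the shapes mirror the cuda_attn_L128_bfloat16 workload):

# Minimal sketch of the xformers call exercised by the new benchmark cell.
import torch
import xformers.ops as xops

# xFormers takes [batch, seq_len, heads, head_dim] tensors and returns the same layout.
q = torch.randn(1, 4224, 24, 128, device="cuda", dtype=torch.bfloat16)
k = torch.randn_like(q)
v = torch.randn_like(q)

out = xops.memory_efficient_attention(q, k, v)
print(out.shape)  # torch.Size([1, 4224, 24, 128])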
flash_attn/impls/flash_attention.html
CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
  </span> |
- Cell: nv | 0.26s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3888,7 +3888,7 @@ Cell: nv | 0.26s
  </div>
  </div>
  <div id="output-nv" class="cell-output">
- <div class="cell-stdout"><pre class="stdout-text">
  +-----------------------------------------------------------------------------------------+
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
  |-----------------------------------------+------------------------+----------------------+

@@ -3897,7 +3897,7 @@ Cell: nv | 0.26s
  | | | MIG M. |
  |=========================================+========================+======================|
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
- | N/A
  | | | N/A |
  +-----------------------------------------+------------------------+----------------------+
@@ -3919,9 +3919,9 @@ Cell: nv | 0.26s
  <span class="collapse-indicators">
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" style="cursor:
  </span> |
- Cell: benchmark |
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3972,29 +3972,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
@@ -4004,29 +4004,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
@@ -4036,29 +4036,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
@@ -4068,29 +4068,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
@@ -4100,29 +4100,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
@@ -4132,39 +4132,91 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
- (old profiler tables for the six workloads; operator rows truncated in the diff view)

  impl wl p50(ms) ok
  torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
- torch_flash_ma cuda_attn_L256_bfloat16 1.
- torch_flash_ma cuda_attn_L320_bfloat16 1.
- torch_flash_ma cuda_attn_L384_bfloat16 1.
  torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
  torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
  </pre></div>
  <div class="cell-artifacts">
  <h4>Artifacts:</h4>
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
  </span> |
+ Cell: nv | 0.28s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
  </div>
  </div>
  <div id="output-nv" class="cell-output">
+ <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:25:53 2025
  +-----------------------------------------------------------------------------------------+
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
  |-----------------------------------------+------------------------+----------------------+

  | | | MIG M. |
  |=========================================+========================+======================|
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+ | N/A 27C P8 21W / 350W | 0MiB / 46068MiB | 0% Default |
  | | | N/A |
  +-----------------------------------------+------------------------+----------------------+
  <span class="collapse-indicators">
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
+ Cell: benchmark | 32.77s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.644ms 102.02% 3.644ms 3.644ms 1
+ torch_flash_ma 6.80% 356.846us 47.04% 2.468ms 2.468ms 0.000us 0.00% 3.612ms 3.612ms 1
+ aten::scaled_dot_product_attention 0.82% 43.042us 4.47% 234.776us 78.259us 0.000us 0.00% 2.857ms 952.201us 3
+ aten::_scaled_dot_product_flash_attention 0.56% 29.330us 3.65% 191.734us 63.911us 0.000us 0.00% 2.857ms 952.201us 3
+ aten::_flash_attention_forward 0.75% 39.581us 2.59% 135.674us 45.225us 2.857ms 79.97% 2.857ms 952.201us 3
+ void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.857ms 79.97% 2.857ms 952.201us 3
+ aten::contiguous 0.27% 14.180us 34.32% 1.801ms 150.051us 0.000us 0.00% 755.680us 62.973us 12
+ aten::clone 0.74% 38.791us 34.04% 1.786ms 148.870us 0.000us 0.00% 755.680us 62.973us 12
+ aten::copy_ 1.85% 97.030us 31.43% 1.649ms 137.429us 715.456us 20.03% 755.680us 62.973us 12
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 715.456us 20.03% 715.456us 59.621us 12
+ Activity Buffer Request 27.38% 1.437ms 27.38% 1.437ms 1.437ms 40.224us 1.13% 40.224us 40.224us 1
+ aten::transpose 1.47% 77.273us 1.96% 102.714us 4.280us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.48% 25.441us 0.48% 25.441us 1.060us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.70% 36.821us 2.35% 123.326us 8.222us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.93% 101.493us 1.93% 101.493us 4.229us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 2.70% 141.775us 2.70% 141.775us 9.452us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.35% 18.402us 0.35% 18.402us 6.134us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.05% 2.540us 0.05% 2.540us 0.423us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.17% 8.890us 0.17% 8.890us 2.963us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 52.96% 2.779ms 52.96% 2.779ms 2.779ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 5.247ms
+ Self CUDA time total: 3.572ms
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_flash_ma  4.70%  246.528us  41.73%  2.189ms  2.189ms  0.000us  0.00%  3.817ms  3.817ms  1
+torch_flash_ma  0.00%  0.000us  0.00%  0.000us  0.000us  3.772ms  100.28%  3.772ms  3.772ms  1
+aten::scaled_dot_product_attention  0.51%  26.610us  3.43%  180.143us  60.048us  0.000us  0.00%  2.999ms  999.573us  3
+aten::_scaled_dot_product_flash_attention  0.37%  19.600us  2.93%  153.533us  51.178us  0.000us  0.00%  2.999ms  999.573us  3
+aten::_flash_attention_forward  0.63%  32.980us  2.12%  111.443us  37.148us  2.999ms  79.71%  2.999ms  999.573us  3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...  0.00%  0.000us  0.00%  0.000us  0.000us  2.999ms  79.71%  2.999ms  999.573us  3
+aten::contiguous  0.19%  10.030us  32.68%  1.715ms  142.893us  0.000us  0.00%  818.210us  68.184us  12
+aten::clone  0.55%  29.002us  32.49%  1.705ms  142.057us  0.000us  0.00%  818.210us  68.184us  12
+aten::copy_  2.09%  109.441us  30.74%  1.613ms  134.399us  763.297us  20.29%  818.210us  68.184us  12
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  763.297us  20.29%  763.297us  63.608us  12
+Activity Buffer Request  26.94%  1.413ms  26.94%  1.413ms  1.413ms  54.913us  1.46%  54.913us  54.913us  1
+aten::transpose  1.00%  52.652us  1.34%  70.433us  2.935us  0.000us  0.00%  0.000us  0.000us  24
+aten::as_strided  0.34%  17.781us  0.34%  17.781us  0.741us  0.000us  0.00%  0.000us  0.000us  24
+aten::empty_like  0.38%  19.980us  1.61%  84.581us  5.639us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty  1.45%  76.201us  1.45%  76.201us  3.175us  0.000us  0.00%  0.000us  0.000us  24
+cudaLaunchKernel  2.16%  113.102us  2.16%  113.102us  7.540us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_strided  0.31%  16.430us  0.31%  16.430us  5.477us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceGetAttribute  0.03%  1.751us  0.03%  1.751us  0.292us  0.000us  0.00%  0.000us  0.000us  6
+cudaFuncSetAttribute  0.07%  3.771us  0.07%  3.771us  1.257us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  58.27%  3.058ms  58.27%  3.058ms  3.058ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.247ms
+Self CUDA time total: 3.762ms
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_flash_ma  4.50%  237.986us  41.18%  2.178ms  2.178ms  0.000us  0.00%  3.833ms  3.833ms  1
+torch_flash_ma  0.00%  0.000us  0.00%  0.000us  0.000us  3.785ms  100.29%  3.785ms  3.785ms  1
+aten::scaled_dot_product_attention  0.46%  24.381us  3.40%  179.915us  59.972us  0.000us  0.00%  2.998ms  999.221us  3
+aten::_scaled_dot_product_flash_attention  0.36%  19.171us  2.94%  155.534us  51.845us  0.000us  0.00%  2.998ms  999.221us  3
+aten::_flash_attention_forward  0.65%  34.259us  2.15%  113.691us  37.897us  2.998ms  79.44%  2.998ms  999.221us  3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...  0.00%  0.000us  0.00%  0.000us  0.000us  2.998ms  79.44%  2.998ms  999.221us  3
+aten::contiguous  0.19%  9.800us  32.38%  1.712ms  142.708us  0.000us  0.00%  835.263us  69.605us  12
+aten::clone  0.53%  28.211us  32.20%  1.703ms  141.891us  0.000us  0.00%  835.263us  69.605us  12
+aten::copy_  1.60%  84.650us  30.46%  1.611ms  134.247us  776.063us  20.56%  835.263us  69.605us  12
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  776.063us  20.56%  776.063us  64.672us  12
+Activity Buffer Request  27.18%  1.437ms  27.18%  1.437ms  1.437ms  59.200us  1.57%  59.200us  59.200us  1
+aten::transpose  0.99%  52.225us  1.33%  70.125us  2.922us  0.000us  0.00%  0.000us  0.000us  24
+aten::as_strided  0.34%  17.900us  0.34%  17.900us  0.746us  0.000us  0.00%  0.000us  0.000us  24
+aten::empty_like  0.37%  19.782us  1.60%  84.803us  5.654us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty  1.45%  76.431us  1.45%  76.431us  3.185us  0.000us  0.00%  0.000us  0.000us  24
+cudaLaunchKernel  2.16%  114.204us  2.16%  114.204us  7.614us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_strided  0.30%  16.100us  0.30%  16.100us  5.367us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceGetAttribute  0.03%  1.730us  0.03%  1.730us  0.288us  0.000us  0.00%  0.000us  0.000us  6
+cudaFuncSetAttribute  0.07%  3.730us  0.07%  3.730us  1.243us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  58.82%  3.110ms  58.82%  3.110ms  3.110ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.288ms
+Self CUDA time total: 3.774ms
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_flash_ma  4.36%  241.837us  43.33%  2.405ms  2.405ms  0.000us  0.00%  3.884ms  3.884ms  1
+torch_flash_ma  0.00%  0.000us  0.00%  0.000us  0.000us  3.837ms  100.27%  3.837ms  3.837ms  1
+aten::scaled_dot_product_attention  0.48%  26.802us  3.27%  181.715us  60.572us  0.000us  0.00%  3.042ms  1.014ms  3
+aten::_scaled_dot_product_flash_attention  0.35%  19.308us  2.79%  154.913us  51.638us  0.000us  0.00%  3.042ms  1.014ms  3
+aten::_flash_attention_forward  0.60%  33.361us  2.03%  112.712us  37.571us  3.042ms  79.50%  3.042ms  1.014ms  3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...  0.00%  0.000us  0.00%  0.000us  0.000us  3.042ms  79.50%  3.042ms  1.014ms  3
+aten::contiguous  0.17%  9.659us  34.84%  1.934ms  161.162us  0.000us  0.00%  841.829us  70.152us  12
+aten::clone  0.50%  27.830us  34.67%  1.924ms  160.357us  0.000us  0.00%  841.829us  70.152us  12
+aten::copy_  1.56%  86.702us  32.55%  1.807ms  150.547us  784.548us  20.50%  841.829us  70.152us  12
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  784.548us  20.50%  784.548us  65.379us  12
+Activity Buffer Request  25.45%  1.413ms  25.45%  1.413ms  1.413ms  57.281us  1.50%  57.281us  57.281us  1
+aten::transpose  0.95%  52.620us  1.27%  70.404us  2.933us  0.000us  0.00%  0.000us  0.000us  24
+aten::as_strided  0.32%  17.784us  0.32%  17.784us  0.741us  0.000us  0.00%  0.000us  0.000us  24
+aten::empty_like  0.78%  43.221us  2.00%  111.194us  7.413us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty  1.45%  80.673us  1.45%  80.673us  3.361us  0.000us  0.00%  0.000us  0.000us  24
+cudaLaunchKernel  5.96%  331.078us  5.96%  331.078us  22.072us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_strided  0.28%  15.800us  0.28%  15.800us  5.267us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceGetAttribute  0.03%  1.730us  0.03%  1.730us  0.288us  0.000us  0.00%  0.000us  0.000us  6
+cudaFuncSetAttribute  0.07%  3.850us  0.07%  3.850us  1.283us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  56.67%  3.146ms  56.67%  3.146ms  3.146ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.551ms
+Self CUDA time total: 3.827ms
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_flash_ma  4.46%  268.165us  40.09%  2.413ms  2.413ms  0.000us  0.00%  4.405ms  4.405ms  1
+torch_flash_ma  0.00%  0.000us  0.00%  0.000us  0.000us  4.355ms  100.25%  4.355ms  4.355ms  1
+aten::scaled_dot_product_attention  0.46%  27.642us  3.64%  218.806us  72.935us  0.000us  0.00%  3.540ms  1.180ms  3
+aten::_scaled_dot_product_flash_attention  0.75%  45.250us  3.18%  191.164us  63.721us  0.000us  0.00%  3.540ms  1.180ms  3
+aten::_flash_attention_forward  0.61%  36.651us  2.01%  120.923us  40.308us  3.540ms  81.48%  3.540ms  1.180ms  3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...  0.00%  0.000us  0.00%  0.000us  0.000us  3.540ms  81.48%  3.540ms  1.180ms  3
+aten::contiguous  0.18%  10.862us  31.11%  1.873ms  156.050us  0.000us  0.00%  865.606us  72.134us  12
+aten::clone  0.51%  30.490us  30.93%  1.862ms  155.145us  0.000us  0.00%  865.606us  72.134us  12
+aten::copy_  1.51%  90.931us  29.34%  1.766ms  147.155us  804.645us  18.52%  865.606us  72.134us  12
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  804.645us  18.52%  804.645us  67.054us  12
+Activity Buffer Request  21.61%  1.300ms  21.61%  1.300ms  1.300ms  60.961us  1.40%  60.961us  60.961us  1
+aten::transpose  0.99%  59.753us  1.30%  78.501us  3.271us  0.000us  0.00%  0.000us  0.000us  24
+aten::as_strided  0.31%  18.748us  0.31%  18.748us  0.781us  0.000us  0.00%  0.000us  0.000us  24
+aten::empty_like  0.35%  20.935us  1.45%  87.165us  5.811us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty  1.32%  79.690us  1.32%  79.690us  3.320us  0.000us  0.00%  0.000us  0.000us  24
+cudaLaunchKernel  6.67%  401.680us  6.67%  401.680us  26.779us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_strided  0.27%  16.081us  0.27%  16.081us  5.360us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceGetAttribute  0.03%  2.030us  0.03%  2.030us  0.338us  0.000us  0.00%  0.000us  0.000us  6
+cudaFuncSetAttribute  0.06%  3.810us  0.06%  3.810us  1.270us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  59.91%  3.605ms  59.91%  3.605ms  3.605ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 6.018ms
+Self CUDA time total: 4.344ms
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_flash_ma  4.01%  246.839us  39.75%  2.447ms  2.447ms  0.000us  0.00%  4.458ms  4.458ms  1
+torch_flash_ma  0.00%  0.000us  0.00%  0.000us  0.000us  4.407ms  100.23%  4.407ms  4.407ms  1
+aten::scaled_dot_product_attention  0.40%  24.621us  2.95%  181.474us  60.491us  0.000us  0.00%  3.579ms  1.193ms  3
+aten::_scaled_dot_product_flash_attention  0.34%  20.980us  2.55%  156.853us  52.284us  0.000us  0.00%  3.579ms  1.193ms  3
+aten::_flash_attention_forward  0.58%  35.588us  1.84%  113.003us  37.668us  3.579ms  81.40%  3.579ms  1.193ms  3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...  0.00%  0.000us  0.00%  0.000us  0.000us  3.579ms  81.40%  3.579ms  1.193ms  3
+aten::contiguous  0.16%  10.061us  32.01%  1.971ms  164.244us  0.000us  0.00%  878.818us  73.235us  12
+aten::clone  0.50%  30.903us  31.85%  1.961ms  163.406us  0.000us  0.00%  878.818us  73.235us  12
+aten::copy_  1.35%  82.841us  30.27%  1.864ms  155.305us  817.634us  18.60%  878.818us  73.235us  12
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  817.634us  18.60%  817.634us  68.136us  12
+Activity Buffer Request  23.50%  1.447ms  23.50%  1.447ms  1.447ms  61.184us  1.39%  61.184us  61.184us  1
+aten::transpose  0.85%  52.630us  1.15%  70.790us  2.950us  0.000us  0.00%  0.000us  0.000us  24
+aten::as_strided  0.29%  18.160us  0.29%  18.160us  0.757us  0.000us  0.00%  0.000us  0.000us  24
+aten::empty_like  0.33%  20.456us  1.41%  86.700us  5.780us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty  1.28%  78.794us  1.28%  78.794us  3.283us  0.000us  0.00%  0.000us  0.000us  24
+cudaLaunchKernel  5.81%  357.919us  5.81%  357.919us  23.861us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_strided  0.25%  15.401us  0.25%  15.401us  5.134us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceGetAttribute  0.03%  1.632us  0.03%  1.632us  0.272us  0.000us  0.00%  0.000us  0.000us  6
+cudaFuncSetAttribute  0.06%  3.720us  0.06%  3.720us  1.240us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  60.25%  3.709ms  60.25%  3.709ms  3.709ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 6.156ms
+Self CUDA time total: 4.397ms
impl wl p50(ms) ok
torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
+torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
+torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
+torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
</pre></div>
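The torch_flash_ma traces above are standard torch.profiler key-average tables. Below is a minimal sketch of how such a trace can be produced; the shapes, dtype, and helper name are assumptions, and the harness's actual code lives in cells/benchmark.py.

import torch
from torch.profiler import ProfilerActivity, profile, record_function

def torch_flash_ma(q, k, v):
    # (B, S, H, D) -> (B, H, S, D); the aten::transpose / aten::contiguous /
    # aten::copy_ rows in the tables above come from this kind of layout fix-up.
    q, k, v = (t.transpose(1, 2).contiguous() for t in (q, k, v))
    return torch.nn.functional.scaled_dot_product_attention(q, k, v)

B, S, H, D = 1, 512, 16, 64  # assumed workload shape for cuda_attn_L512
q, k, v = (torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16)
           for _ in range(3))

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("torch_flash_ma"):
        for _ in range(3):  # matches "# of Calls" = 3 for the attention ops
            torch_flash_ma(q, k, v)
    torch.cuda.synchronize()  # accounts for the cudaDeviceSynchronize row
print(prof.key_averages().table(sort_by="self_cuda_time_total"))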
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading matplotlib (8.3MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading numpy (16.2MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading networkx (1.9MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading sympy (6.0MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading setuptools (1.1MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading triton (148.3MiB)
+Downloading pillow (6.7MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading fonttools (4.7MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading torch (846.9MiB)
+Downloading nvidia-cufile-cu12
+Downloading kiwisolver
+Downloading setuptools
+Downloading networkx
+Downloading fonttools
+Downloading pillow
+Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+Downloading nvidia-cuda-cupti-cu12
+Downloading matplotlib
+Downloading numpy
+Downloading sympy
+Downloading nvidia-nvjitlink-cu12
+Downloading nvidia-curand-cu12
+Downloading nvidia-cuda-nvrtc-cu12
+Downloading triton
+Downloading nvidia-cufft-cu12
+Downloading nvidia-cusolver-cu12
+Downloading nvidia-cusparse-cu12
+Downloading nvidia-cusparselt-cu12
+Downloading nvidia-nccl-cu12
+Downloading nvidia-cublas-cu12
+Downloading nvidia-cudnn-cu12
+Downloading torch
+Installed 37 packages in 212ms
+</div>
+</div>
<div class="cell-artifacts">
<h4>Artifacts:</h4>
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
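Each benchmark cell writes its raw measurements to the attention.jsonl artifact linked above. A minimal sketch of consuming it to reproduce the p50 summary tables; the field names are assumptions about the record layout, so check one line of the file first.

import json

with open("artifacts/benchmark/attention.jsonl") as f:
    for line in f:
        rec = json.loads(line)
        # e.g. impl="torch_flash_ma", workload name="cuda_attn_L512_bfloat16"
        print(f"{rec['impl']:24s} {rec['wl']['name']:28s} "
              f"{rec['lat_ms']['p50']:6.2f} {rec['ok']}")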
flash_attn/impls/hf_kernels_flash_attn.html CHANGED
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
<span class="collapse-indicators">
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
-<span id="uv-indicator-benchmark"
</span> |
-Cell: benchmark |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3926,21 +3926,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn 3.
-_flash_attn_9e27194::fwd 1.
-hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-Activity Buffer Request
-cudaDeviceGetAttribute 0.13% 5.
-aten::empty_like 0.
-aten::empty_strided 0.
-aten::empty 0.57%
-cudaFuncSetAttribute 0.
-cudaLaunchKernel
-cudaDeviceSynchronize 58.
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.
-Self CUDA time total: 2.
@@ -3950,21 +3950,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn 2.
-_flash_attn_9e27194::fwd 1.
-hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-Activity Buffer Request
-cudaDeviceGetAttribute 0.
-aten::empty_like 0.17% 7.
-aten::empty_strided 0.
-aten::empty 0.
-cudaFuncSetAttribute 0.08% 3.
-cudaLaunchKernel 0.
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.
-Self CUDA time total: 2.
@@ -3974,21 +3974,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn 2.
-_flash_attn_9e27194::fwd 1.
-hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-Activity Buffer Request 31.
-cudaDeviceGetAttribute 0.
-aten::empty_like 0.
-aten::empty_strided 0.
-aten::empty 0.
-cudaFuncSetAttribute 0.
-cudaLaunchKernel 0.
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.
-Self CUDA time total: 3.
@@ -3998,21 +3998,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn 2.
-_flash_attn_9e27194::fwd 1.
-hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-Activity Buffer Request 29.
-cudaDeviceGetAttribute 0.
-aten::empty_like 0.
-aten::empty_strided 0.
-aten::empty 0.45% 21.
-cudaFuncSetAttribute 0.08% 3.
-cudaLaunchKernel
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.
-Self CUDA time total: 3.
@@ -4022,21 +4022,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn 2.
-_flash_attn_9e27194::fwd
-hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-Activity Buffer Request
-cudaDeviceGetAttribute 0.08% 4.
-aten::empty_like 0.
-aten::empty_strided 0.
-aten::empty 0.40% 21.
-cudaFuncSetAttribute 0.07% 3.
-cudaLaunchKernel
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.
-Self CUDA time total: 3.
@@ -4046,41 +4046,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn
-_flash_attn_9e27194::fwd
-hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-Activity Buffer Request 26.
-cudaDeviceGetAttribute 0.08% 4.
-aten::empty_like 0.
-aten::empty_strided 0.
-aten::empty 0.
-cudaFuncSetAttribute 0.08% 4.
-cudaLaunchKernel
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.
-Self CUDA time total: 3.
impl wl p50(ms) ok
-hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
-hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.
-hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.
-hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.
</pre></div>
-<div class="
-
-
-
</div>
-</div>
-<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
-Fetching 20 files: 5%|▌ | 1/20 [00:00<00:04, 4.26it/s]
-Fetching 20 files: 10%|█ | 2/20 [00:01<00:17, 1.03it/s]
-Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 11.64it/s]</div>
<div class="cell-artifacts">
<h4>Artifacts:</h4>
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
<span class="collapse-indicators">
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
</span> |
+Cell: benchmark | 5.58s
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn  3.55%  156.153us  41.08%  1.807ms  1.807ms  0.000us  0.00%  3.775ms  3.775ms  1
+_flash_attn_9e27194::fwd  1.65%  72.542us  37.53%  1.651ms  550.240us  2.812ms  100.00%  3.775ms  1.258ms  3
+hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us  2.814ms  100.05%  2.814ms  2.814ms  1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us  2.812ms  100.00%  2.812ms  937.398us  3
+Activity Buffer Request  32.22%  1.417ms  32.22%  1.417ms  1.417ms  962.880us  34.24%  962.880us  962.880us  1
+cudaDeviceGetAttribute  0.13%  5.500us  0.13%  5.500us  0.367us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_like  0.43%  19.110us  1.25%  54.882us  18.294us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty_strided  0.81%  35.772us  0.81%  35.772us  11.924us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty  0.57%  25.101us  0.57%  25.101us  2.789us  0.000us  0.00%  0.000us  0.000us  9
+cudaFuncSetAttribute  0.30%  13.270us  0.30%  13.270us  4.423us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  1.42%  62.402us  1.42%  62.402us  20.801us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  58.92%  2.591ms  58.92%  2.591ms  2.591ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.398ms
+Self CUDA time total: 2.812ms
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn  2.04%  91.192us  36.62%  1.634ms  1.634ms  0.000us  0.00%  3.983ms  3.983ms  1
+_flash_attn_9e27194::fwd  1.11%  49.718us  34.57%  1.543ms  514.203us  2.978ms  100.00%  3.983ms  1.328ms  3
+hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us  2.980ms  100.05%  2.980ms  2.980ms  1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us  2.978ms  100.00%  2.978ms  992.707us  3
+Activity Buffer Request  31.74%  1.416ms  31.74%  1.416ms  1.416ms  1.004ms  33.73%  1.004ms  1.004ms  1
+cudaDeviceGetAttribute  0.08%  3.711us  0.08%  3.711us  0.247us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_like  0.17%  7.481us  0.51%  22.841us  7.614us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty_strided  0.34%  15.360us  0.34%  15.360us  5.120us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty  0.46%  20.620us  0.46%  20.620us  2.291us  0.000us  0.00%  0.000us  0.000us  9
+cudaFuncSetAttribute  0.08%  3.741us  0.08%  3.741us  1.247us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  0.58%  25.842us  0.58%  25.842us  8.614us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  63.38%  2.828ms  63.38%  2.828ms  2.828ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.462ms
+Self CUDA time total: 2.978ms
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn  2.28%  105.284us  36.17%  1.673ms  1.673ms  0.000us  0.00%  4.145ms  4.145ms  1
+_flash_attn_9e27194::fwd  1.09%  50.271us  33.89%  1.567ms  522.459us  3.096ms  100.00%  4.145ms  1.382ms  3
+hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us  3.098ms  100.05%  3.098ms  3.098ms  1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us  3.096ms  100.00%  3.096ms  1.032ms  3
+Activity Buffer Request  31.08%  1.437ms  31.08%  1.437ms  1.437ms  1.049ms  33.87%  1.049ms  1.049ms  1
+cudaDeviceGetAttribute  0.08%  3.850us  0.08%  3.850us  0.257us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_like  0.15%  7.061us  0.49%  22.631us  7.544us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty_strided  0.34%  15.570us  0.34%  15.570us  5.190us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty  0.47%  21.760us  0.47%  21.760us  2.418us  0.000us  0.00%  0.000us  0.000us  9
+cudaFuncSetAttribute  0.08%  3.689us  0.08%  3.689us  1.230us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  0.61%  27.992us  0.61%  27.992us  9.331us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  63.83%  2.952ms  63.83%  2.952ms  2.952ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.625ms
+Self CUDA time total: 3.096ms
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn  2.30%  110.882us  38.29%  1.842ms  1.842ms  0.000us  0.00%  4.161ms  4.161ms  1
+_flash_attn_9e27194::fwd  1.05%  50.321us  35.98%  1.731ms  577.014us  3.117ms  100.00%  4.161ms  1.387ms  3
+hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us  3.118ms  100.05%  3.118ms  3.118ms  1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us  3.117ms  100.00%  3.117ms  1.039ms  3
+Activity Buffer Request  29.64%  1.426ms  29.64%  1.426ms  1.426ms  1.044ms  33.50%  1.044ms  1.044ms  1
+cudaDeviceGetAttribute  0.08%  3.780us  0.08%  3.780us  0.252us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_like  0.15%  7.259us  0.50%  24.240us  8.080us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty_strided  0.35%  16.981us  0.35%  16.981us  5.660us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty  0.45%  21.602us  0.45%  21.602us  2.400us  0.000us  0.00%  0.000us  0.000us  9
+cudaFuncSetAttribute  0.08%  3.770us  0.08%  3.770us  1.257us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  4.18%  201.205us  4.18%  201.205us  67.068us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  61.71%  2.969ms  61.71%  2.969ms  2.969ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.811ms
+Self CUDA time total: 3.117ms
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn  2.05%  108.443us  34.64%  1.832ms  1.832ms  0.000us  0.00%  4.810ms  4.810ms  1
+_flash_attn_9e27194::fwd  0.96%  50.812us  32.59%  1.723ms  574.364us  3.602ms  100.00%  4.810ms  1.603ms  3
+hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us  3.603ms  100.04%  3.603ms  3.603ms  1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us  3.602ms  100.00%  3.602ms  1.201ms  3
+Activity Buffer Request  27.53%  1.455ms  27.53%  1.455ms  1.455ms  1.209ms  33.55%  1.209ms  1.209ms  1
+cudaDeviceGetAttribute  0.08%  4.070us  0.08%  4.070us  0.271us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_like  0.14%  7.390us  0.45%  23.900us  7.967us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty_strided  0.31%  16.510us  0.31%  16.510us  5.503us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty  0.40%  21.151us  0.40%  21.151us  2.350us  0.000us  0.00%  0.000us  0.000us  9
+cudaFuncSetAttribute  0.07%  3.770us  0.07%  3.770us  1.257us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  3.10%  164.023us  3.10%  164.023us  54.674us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  65.36%  3.455ms  65.36%  3.455ms  3.455ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.287ms
+Self CUDA time total: 3.602ms
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn  1.95%  105.103us  34.11%  1.836ms  1.836ms  0.000us  0.00%  4.931ms  4.931ms  1
+_flash_attn_9e27194::fwd  1.08%  58.141us  32.16%  1.731ms  577.087us  3.693ms  100.00%  4.931ms  1.644ms  3
+hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us  3.695ms  100.04%  3.695ms  3.695ms  1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us  3.693ms  100.00%  3.693ms  1.231ms  3
+Activity Buffer Request  26.71%  1.438ms  26.71%  1.438ms  1.438ms  1.238ms  33.53%  1.238ms  1.238ms  1
+cudaDeviceGetAttribute  0.08%  4.380us  0.08%  4.380us  0.292us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_like  0.15%  8.230us  0.50%  26.750us  8.917us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty_strided  0.34%  18.520us  0.34%  18.520us  6.173us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty  0.48%  25.961us  0.48%  25.961us  2.885us  0.000us  0.00%  0.000us  0.000us  9
+cudaFuncSetAttribute  0.08%  4.220us  0.08%  4.220us  1.407us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  3.23%  173.714us  3.23%  173.714us  57.905us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  65.89%  3.548ms  65.89%  3.548ms  3.548ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.384ms
+Self CUDA time total: 3.693ms
impl wl p50(ms) ok
+hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.97 True
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
+hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.09 True
+hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True
+hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
</pre></div>
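Unlike torch_flash_ma, the _flash_attn_9e27194::fwd rows above come from a pre-built kernel fetched from the Hub (the "Fetching 20 files" stderr below is that download). A minimal sketch of loading it, assuming the Hugging Face kernels package; the repo id and entry-point name are assumptions, not the harness's verbatim code.

import torch
from kernels import get_kernel  # assumed API of the `kernels` package

flash = get_kernel("kernels-community/flash-attn")  # triggers the file fetch
q = torch.randn(1, 512, 16, 64, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)
out = flash.flash_attn_func(q, k, v)  # assumed entry point; dispatches to fwd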
+<div class="cell-stderr">
+Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
+Fetching 20 files: 10%|█ | 2/20 [00:01<00:13, 1.34it/s]
+Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 13.40it/s]
</div>
<div class="cell-artifacts">
<h4>Artifacts:</h4>
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
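The ok column in the summary tables is a pass/fail correctness flag against a reference implementation. A minimal sketch of such a check, assuming a float32 reference output and illustrative tolerances; the harness's actual tolerances live in the benchmark tooling.

import torch

def check_ok(out: torch.Tensor, ref: torch.Tensor,
             rtol: float = 2e-2, atol: float = 2e-2) -> bool:
    # ref is assumed to be computed in float32; the kernels ran in bfloat16.
    absmax = (out.float() - ref).abs().max().item()
    print(f"absmax={absmax:.3e}")
    return torch.allclose(out.float(), ref, rtol=rtol, atol=atol)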
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
</span> |
-Cell: benchmark | 5.
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3925,19 +3925,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn3 3.
-FlashAttnFunc
-_flash_attn3_48fe103_dirty::fwd 1.
-hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-Activity Buffer Request 33.
-aten::empty 1.
-cudaFuncSetAttribute 0.
-cudaLaunchKernel 1.04%
-cudaDeviceSynchronize 55.
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.
-Self CUDA time total: 2.
@@ -3947,19 +3947,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn3
-FlashAttnFunc 2.
-_flash_attn3_48fe103_dirty::fwd 1.
-hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-Activity Buffer Request 33.
-aten::empty 0.
-cudaFuncSetAttribute 0.12%
-cudaLaunchKernel 0.
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.
-Self CUDA time total: 2.
@@ -3969,19 +3969,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn3 2.
-FlashAttnFunc
-_flash_attn3_48fe103_dirty::fwd 1.
-hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-Activity Buffer Request
-aten::empty 0.
-cudaFuncSetAttribute 0.
-cudaLaunchKernel 0.69% 30.
-cudaDeviceSynchronize 60.
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.
-Self CUDA time total: 2.
@@ -3991,19 +3991,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn3 2.
-FlashAttnFunc 1.
-_flash_attn3_48fe103_dirty::fwd 1.
-hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-Activity Buffer Request
-aten::empty 0.
-cudaFuncSetAttribute 0.
-cudaLaunchKernel
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.
-Self CUDA time total: 2.
@@ -4013,19 +4013,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn3 2.
-FlashAttnFunc 1.
-_flash_attn3_48fe103_dirty::fwd
-hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-Activity Buffer Request
-aten::empty 0.
-cudaFuncSetAttribute 0.
-cudaLaunchKernel 3.
-cudaDeviceSynchronize 62.
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.
-Self CUDA time total: 3.
@@ -4035,33 +4035,33 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn3 2.
-FlashAttnFunc
-_flash_attn3_48fe103_dirty::fwd 1.
-hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-Activity Buffer Request
-aten::empty 0.
-cudaFuncSetAttribute 0.10% 5.
-cudaLaunchKernel 3.
-cudaDeviceSynchronize 63.
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.
-Self CUDA time total: 3.
impl wl p50(ms) ok
-hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.
-hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.
-hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.
-hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.
-hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
</pre></div>
<div class="cell-stderr">
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
-Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.
-Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.
</div>
<div class="cell-artifacts">
<h4>Artifacts:</h4>
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
</span> |
+Cell: benchmark | 5.52s
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn3  3.72%  161.222us  44.67%  1.935ms  1.935ms  0.000us  0.00%  3.599ms  3.599ms  1
+FlashAttnFunc  2.81%  121.834us  40.95%  1.774ms  591.218us  0.000us  0.00%  3.599ms  1.200ms  3
+_flash_attn3_48fe103_dirty::fwd  1.85%  79.992us  38.14%  1.652ms  550.607us  2.693ms  100.00%  3.599ms  1.200ms  3
+hf_kernels_flash_attn3  0.00%  0.000us  0.00%  0.000us  0.000us  2.695ms  100.05%  2.695ms  2.695ms  1
+void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us  2.693ms  100.00%  2.693ms  897.759us  3
+Activity Buffer Request  33.93%  1.470ms  33.93%  1.470ms  1.470ms  905.439us  33.62%  905.439us  905.439us  1
+aten::empty  1.00%  43.311us  1.00%  43.311us  7.219us  0.000us  0.00%  0.000us  0.000us  6
+cudaFuncSetAttribute  0.32%  13.891us  0.32%  13.891us  4.630us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  1.04%  45.121us  1.04%  45.121us  15.040us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  55.33%  2.396ms  55.33%  2.396ms  2.396ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.331ms
+Self CUDA time total: 2.693ms
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name                                                     Self CPU %   Self CPU   CPU total %   CPU total   CPU time avg   Self CUDA   Self CUDA %   CUDA total   CUDA time avg   # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn3  2.17%  96.772us  39.76%  1.770ms  1.770ms  0.000us  0.00%  3.876ms  3.876ms  1
+FlashAttnFunc  2.04%  90.694us  37.59%  1.674ms  557.834us  0.000us  0.00%  3.876ms  1.292ms  3
+_flash_attn3_48fe103_dirty::fwd  1.15%  51.142us  35.55%  1.583ms  527.603us  2.896ms  100.00%  3.876ms  1.292ms  3
+hf_kernels_flash_attn3  0.00%  0.000us  0.00%  0.000us  0.000us  2.898ms  100.05%  2.898ms  2.898ms  1
+void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us  2.896ms  100.00%  2.896ms  965.387us  3
+Activity Buffer Request  33.04%  1.471ms  33.04%  1.471ms  1.471ms  979.809us  33.83%  979.809us  979.809us  1
+aten::empty  0.58%  25.610us  0.58%  25.610us  4.268us  0.000us  0.00%  0.000us  0.000us  6
+cudaFuncSetAttribute  0.12%  5.240us  0.12%  5.240us  1.747us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  0.67%  29.750us  0.67%  29.750us  9.917us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  60.24%  2.682ms  60.24%  2.682ms  2.682ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.452ms
+Self CUDA time total: 2.896ms
| 3969 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3970 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3971 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3972 |
+
hf_kernels_flash_attn3 2.19% 98.331us 39.82% 1.786ms 1.786ms 0.000us 0.00% 3.885ms 3.885ms 1
|
| 3973 |
+
FlashAttnFunc 1.99% 89.333us 37.63% 1.688ms 562.551us 0.000us 0.00% 3.885ms 1.295ms 3
|
| 3974 |
+
_flash_attn3_48fe103_dirty::fwd 1.08% 48.311us 35.64% 1.598ms 532.773us 2.912ms 100.00% 3.885ms 1.295ms 3
|
| 3975 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.914ms 100.05% 2.914ms 2.914ms 1
|
| 3976 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.912ms 100.00% 2.912ms 970.802us 3
|
| 3977 |
+
Activity Buffer Request 33.18% 1.488ms 33.18% 1.488ms 1.488ms 972.637us 33.40% 972.637us 972.637us 1
|
| 3978 |
+
aten::empty 0.57% 25.370us 0.57% 25.370us 4.228us 0.000us 0.00% 0.000us 0.000us 6
|
| 3979 |
+
cudaFuncSetAttribute 0.13% 5.730us 0.13% 5.730us 1.910us 0.000us 0.00% 0.000us 0.000us 3
|
| 3980 |
+
cudaLaunchKernel 0.69% 30.861us 0.69% 30.861us 10.287us 0.000us 0.00% 0.000us 0.000us 3
|
| 3981 |
+
cudaDeviceSynchronize 60.18% 2.699ms 60.18% 2.699ms 2.699ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3982 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3983 |
+
Self CPU time total: 4.485ms
|
| 3984 |
+
Self CUDA time total: 2.912ms
|
| 3985 |
|
| 3986 |
|
| 3987 |
|
|
|
|
| 3991 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3992 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3993 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3994 |
+
hf_kernels_flash_attn3 2.51% 118.553us 41.81% 1.973ms 1.973ms 0.000us 0.00% 3.964ms 3.964ms 1
|
| 3995 |
+
FlashAttnFunc 1.94% 91.662us 39.30% 1.855ms 618.205us 0.000us 0.00% 3.964ms 1.321ms 3
|
| 3996 |
+
_flash_attn3_48fe103_dirty::fwd 1.07% 50.373us 37.36% 1.763ms 587.651us 2.962ms 100.00% 3.964ms 1.321ms 3
|
| 3997 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.964ms 100.05% 2.964ms 2.964ms 1
|
| 3998 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.962ms 100.00% 2.962ms 987.401us 3
|
| 3999 |
+
Activity Buffer Request 30.92% 1.459ms 30.92% 1.459ms 1.459ms 1.002ms 33.82% 1.002ms 1.002ms 1
|
| 4000 |
+
aten::empty 0.56% 26.451us 0.56% 26.451us 4.408us 0.000us 0.00% 0.000us 0.000us 6
|
| 4001 |
+
cudaFuncSetAttribute 0.11% 5.270us 0.11% 5.270us 1.757us 0.000us 0.00% 0.000us 0.000us 3
|
| 4002 |
+
cudaLaunchKernel 4.70% 221.845us 4.70% 221.845us 73.948us 0.000us 0.00% 0.000us 0.000us 3
|
| 4003 |
+
cudaDeviceSynchronize 58.19% 2.746ms 58.19% 2.746ms 2.746ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4004 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4005 |
+
Self CPU time total: 4.719ms
|
| 4006 |
+
Self CUDA time total: 2.962ms
|
| 4007 |
|
| 4008 |
|
| 4009 |
|
|
|
|
| 4013 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4014 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4015 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4016 |
+
hf_kernels_flash_attn3 2.19% 114.453us 37.34% 1.953ms 1.953ms 0.000us 0.00% 4.662ms 4.662ms 1
|
| 4017 |
+
FlashAttnFunc 1.73% 90.401us 35.15% 1.838ms 612.822us 0.000us 0.00% 4.662ms 1.554ms 3
|
| 4018 |
+
_flash_attn3_48fe103_dirty::fwd 0.97% 50.643us 33.42% 1.748ms 582.688us 3.490ms 100.00% 4.662ms 1.554ms 3
|
| 4019 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.492ms 100.04% 3.492ms 3.492ms 1
|
| 4020 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.490ms 100.00% 3.490ms 1.163ms 3
|
| 4021 |
+
Activity Buffer Request 28.44% 1.487ms 28.44% 1.487ms 1.487ms 1.171ms 33.56% 1.171ms 1.171ms 1
|
| 4022 |
+
aten::empty 0.52% 27.271us 0.52% 27.271us 4.545us 0.000us 0.00% 0.000us 0.000us 6
|
| 4023 |
+
cudaFuncSetAttribute 0.09% 4.950us 0.09% 4.950us 1.650us 0.000us 0.00% 0.000us 0.000us 3
|
| 4024 |
+
cudaLaunchKernel 3.40% 178.024us 3.40% 178.024us 59.341us 0.000us 0.00% 0.000us 0.000us 3
|
| 4025 |
+
cudaDeviceSynchronize 62.66% 3.277ms 62.66% 3.277ms 3.277ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4026 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4027 |
+
Self CPU time total: 5.230ms
|
| 4028 |
+
Self CUDA time total: 3.490ms
|
| 4029 |
|
| 4030 |
|
| 4031 |
|
|
|
|
| 4035 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4036 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4037 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4038 |
+
hf_kernels_flash_attn3 2.26% 115.663us 36.27% 1.854ms 1.854ms 0.000us 0.00% 4.679ms 4.679ms 1
|
| 4039 |
+
FlashAttnFunc 2.25% 114.773us 34.01% 1.738ms 579.364us 0.000us 0.00% 4.679ms 1.560ms 3
|
| 4040 |
+
_flash_attn3_48fe103_dirty::fwd 1.02% 51.933us 31.76% 1.623ms 541.107us 3.499ms 100.00% 4.679ms 1.560ms 3
|
| 4041 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.500ms 100.04% 3.500ms 3.500ms 1
|
| 4042 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.499ms 100.00% 3.499ms 1.166ms 3
|
| 4043 |
+
Activity Buffer Request 26.80% 1.370ms 26.80% 1.370ms 1.370ms 1.181ms 33.75% 1.181ms 1.181ms 1
|
| 4044 |
+
aten::empty 0.54% 27.681us 0.54% 27.681us 4.613us 0.000us 0.00% 0.000us 0.000us 6
|
| 4045 |
+
cudaFuncSetAttribute 0.10% 5.079us 0.10% 5.079us 1.693us 0.000us 0.00% 0.000us 0.000us 3
|
| 4046 |
+
cudaLaunchKernel 3.30% 168.813us 3.30% 168.813us 56.271us 0.000us 0.00% 0.000us 0.000us 3
|
| 4047 |
+
cudaDeviceSynchronize 63.73% 3.257ms 63.73% 3.257ms 3.257ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
+
Self CPU time total: 5.111ms
|
| 4050 |
+
Self CUDA time total: 3.499ms
|
| 4051 |
|
| 4052 |
|
| 4053 |
impl wl p50(ms) ok
|
| 4054 |
+
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.94 True
|
| 4055 |
+
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
|
| 4056 |
+
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.04 True
|
| 4057 |
+
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.05 True
|
| 4058 |
+
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
|
| 4059 |
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
|
| 4060 |
</pre></div>
|
| 4061 |
<div class="cell-stderr">
|
| 4062 |
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4063 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.38it/s]
|
| 4064 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.75it/s]
|
| 4065 |
</div>
|
| 4066 |
<div class="cell-artifacts">
|
| 4067 |
<h4>Artifacts:</h4>
|
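For context on the p50(ms) column in these summaries: each workload is timed over a handful of repetitions and the report shows percentiles of the raw latencies. Below is a minimal sketch of that reduction, assuming raw latencies in milliseconds as stored in attention.jsonl; `percentiles` is a hypothetical helper name, not necessarily the harness's actual API in cells/benchmark.py.

    import statistics

    # Nearest-rank percentiles over a small set of raw latencies (ms),
    # mirroring the p10/p50/p90 fields in the benchmark JSONL artifacts.
    # `percentiles` is a hypothetical helper, not the harness's own API.
    def percentiles(raw_times_ms):
        xs = sorted(raw_times_ms)
        n = len(xs)
        def q(p):
            return xs[min(n - 1, max(0, round(p * (n - 1))))]
        return {"p10": q(0.10), "p50": statistics.median(xs), "p90": q(0.90)}

    print(percentiles([0.943, 0.971, 1.038, 1.052, 1.197]))  # p50 -> 1.038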
flash_attn/impls/mem_efficient_attention.html
CHANGED
|
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3869 |
<span class="collapse-indicators">
|
| 3870 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
-
<span id="uv-indicator-benchmark"
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: benchmark |
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3924,28 +3924,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
|
|
| 3924 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3925 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3926 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3927 |
-
torch_mem_eff 4.77%
|
| 3928 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 3929 |
-
aten::scaled_dot_product_attention 0.44%
|
| 3930 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 3931 |
-
aten::_efficient_attention_forward 0.51%
|
| 3932 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3933 |
-
aten::contiguous 0.17%
|
| 3934 |
-
aten::clone 0.
|
| 3935 |
-
aten::copy_ 1.
|
| 3936 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3937 |
-
Activity Buffer Request 20.
|
| 3938 |
-
aten::transpose
|
| 3939 |
-
aten::as_strided 0.33% 23.
|
| 3940 |
-
aten::empty_like 0.
|
| 3941 |
-
aten::empty 1.
|
| 3942 |
-
cudaLaunchKernel 1.
|
| 3943 |
-
cudaStreamIsCapturing 0.
|
| 3944 |
-
cudaFuncSetAttribute 0.
|
| 3945 |
-
cudaDeviceSynchronize 67.
|
| 3946 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3947 |
-
Self CPU time total:
|
| 3948 |
-
Self CUDA time total: 5.
|
| 3949 |
|
| 3950 |
|
| 3951 |
|
|
@@ -3955,28 +3955,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
|
|
| 3955 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3956 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3957 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3958 |
-
torch_mem_eff 3.
|
| 3959 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 3960 |
-
aten::scaled_dot_product_attention 0.
|
| 3961 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 3962 |
-
aten::_efficient_attention_forward 0.
|
| 3963 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3964 |
-
aten::contiguous 0.10% 7.
|
| 3965 |
-
aten::clone 0.
|
| 3966 |
-
aten::copy_ 0.
|
| 3967 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3968 |
-
Activity Buffer Request
|
| 3969 |
-
aten::transpose 0.
|
| 3970 |
-
aten::as_strided 0.
|
| 3971 |
-
aten::empty_like 0.
|
| 3972 |
-
aten::empty 0.
|
| 3973 |
-
cudaLaunchKernel 1.25%
|
| 3974 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 3975 |
-
cudaFuncSetAttribute 0.05% 3.
|
| 3976 |
-
cudaDeviceSynchronize
|
| 3977 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3978 |
-
Self CPU time total: 7.
|
| 3979 |
-
Self CUDA time total: 5.
|
| 3980 |
|
| 3981 |
|
| 3982 |
|
|
@@ -3986,28 +3986,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
|
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3988 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3989 |
-
torch_mem_eff 3.
|
| 3990 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3991 |
-
aten::scaled_dot_product_attention 0.24% 18.
|
| 3992 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 3993 |
-
aten::_efficient_attention_forward 0.
|
| 3994 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 3995 |
-
aten::contiguous 0.
|
| 3996 |
-
aten::clone 0.
|
| 3997 |
-
aten::copy_
|
| 3998 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3999 |
-
Activity Buffer Request
|
| 4000 |
-
aten::transpose 0.
|
| 4001 |
-
aten::as_strided 0.
|
| 4002 |
-
aten::empty_like 0.
|
| 4003 |
-
aten::empty 0.
|
| 4004 |
-
cudaLaunchKernel 1.
|
| 4005 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4006 |
-
cudaFuncSetAttribute 0.
|
| 4007 |
-
cudaDeviceSynchronize
|
| 4008 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4009 |
-
Self CPU time total: 7.
|
| 4010 |
-
Self CUDA time total:
|
| 4011 |
|
| 4012 |
|
| 4013 |
|
|
@@ -4017,28 +4017,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
|
|
| 4017 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4018 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4019 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4020 |
-
torch_mem_eff
|
| 4021 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4022 |
-
aten::scaled_dot_product_attention 0.24% 18.
|
| 4023 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 4024 |
-
aten::_efficient_attention_forward 0.
|
| 4025 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4026 |
-
aten::contiguous 0.10% 7.
|
| 4027 |
-
aten::clone 0.
|
| 4028 |
-
aten::copy_ 0.
|
| 4029 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4030 |
-
Activity Buffer Request 18.
|
| 4031 |
-
aten::transpose 0.
|
| 4032 |
-
aten::as_strided 0.
|
| 4033 |
-
aten::empty_like 0.15%
|
| 4034 |
-
aten::empty 0.
|
| 4035 |
-
cudaLaunchKernel
|
| 4036 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4037 |
-
cudaFuncSetAttribute 0.04% 3.
|
| 4038 |
-
cudaDeviceSynchronize 70.
|
| 4039 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4040 |
-
Self CPU time total: 7.
|
| 4041 |
-
Self CUDA time total: 6.
|
| 4042 |
|
| 4043 |
|
| 4044 |
|
|
@@ -4048,28 +4048,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
|
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4050 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4051 |
-
torch_mem_eff
|
| 4052 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4053 |
-
aten::scaled_dot_product_attention 0.
|
| 4054 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 4055 |
-
aten::_efficient_attention_forward 0.36% 28.
|
| 4056 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4057 |
-
aten::contiguous 0.09%
|
| 4058 |
-
aten::clone 0.28%
|
| 4059 |
-
aten::copy_ 0.
|
| 4060 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4061 |
-
Activity Buffer Request
|
| 4062 |
-
aten::transpose 0.
|
| 4063 |
-
aten::as_strided 0.
|
| 4064 |
-
aten::empty_like 0.
|
| 4065 |
-
aten::empty 0.
|
| 4066 |
-
cudaLaunchKernel 3.
|
| 4067 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4068 |
-
cudaFuncSetAttribute 0.
|
| 4069 |
-
cudaDeviceSynchronize 71.
|
| 4070 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4071 |
-
Self CPU time total: 7.
|
| 4072 |
-
Self CUDA time total: 6.
|
| 4073 |
|
| 4074 |
|
| 4075 |
|
|
@@ -4079,90 +4079,38 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
|
|
| 4079 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4080 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4081 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4082 |
-
torch_mem_eff 3.
|
| 4083 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4084 |
-
aten::scaled_dot_product_attention 0.
|
| 4085 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 4086 |
-
aten::_efficient_attention_forward 0.
|
| 4087 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4088 |
-
aten::contiguous 0.
|
| 4089 |
-
aten::clone 0.
|
| 4090 |
-
aten::copy_ 0.
|
| 4091 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4092 |
-
Activity Buffer Request 17.
|
| 4093 |
-
aten::transpose 0.
|
| 4094 |
-
aten::as_strided 0.
|
| 4095 |
-
aten::empty_like 0.
|
| 4096 |
-
aten::empty 0.
|
| 4097 |
-
cudaLaunchKernel
|
| 4098 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4099 |
-
cudaFuncSetAttribute 0.
|
| 4100 |
-
cudaDeviceSynchronize
|
| 4101 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4102 |
-
Self CPU time total: 8.
|
| 4103 |
-
Self CUDA time total: 6.
|
| 4104 |
|
| 4105 |
|
| 4106 |
impl wl p50(ms) ok
|
| 4107 |
-
torch_mem_eff cuda_attn_L128_bfloat16 1.
|
| 4108 |
-
torch_mem_eff cuda_attn_L256_bfloat16 1.
|
| 4109 |
-
torch_mem_eff cuda_attn_L320_bfloat16 2.
|
| 4110 |
-
torch_mem_eff cuda_attn_L384_bfloat16
|
| 4111 |
-
torch_mem_eff cuda_attn_L448_bfloat16 2.
|
| 4112 |
torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
|
| 4113 |
</pre></div>
|
| 4114 |
-
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4115 |
-
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4116 |
-
<div class="uv-logs-content" style="display: none;">
|
| 4117 |
-
Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
|
| 4118 |
-
Downloading networkx (1.9MiB)
|
| 4119 |
-
Downloading matplotlib (8.3MiB)
|
| 4120 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4121 |
-
Downloading sympy (6.0MiB)
|
| 4122 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4123 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4124 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4125 |
-
Downloading numpy (16.2MiB)
|
| 4126 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4127 |
-
Downloading setuptools (1.1MiB)
|
| 4128 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4129 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4130 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4131 |
-
Downloading kiwisolver (1.4MiB)
|
| 4132 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4133 |
-
Downloading fonttools (4.7MiB)
|
| 4134 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4135 |
-
Downloading pillow (6.7MiB)
|
| 4136 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4137 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4138 |
-
Downloading triton (148.3MiB)
|
| 4139 |
-
Downloading torch (846.9MiB)
|
| 4140 |
-
Downloading nvidia-cufile-cu12
|
| 4141 |
-
Downloading kiwisolver
|
| 4142 |
-
Downloading setuptools
|
| 4143 |
-
Downloading fonttools
|
| 4144 |
-
Downloading networkx
|
| 4145 |
-
Downloading pillow
|
| 4146 |
-
Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
|
| 4147 |
-
Downloading nvidia-cuda-cupti-cu12
|
| 4148 |
-
Downloading matplotlib
|
| 4149 |
-
Downloading numpy
|
| 4150 |
-
Downloading sympy
|
| 4151 |
-
Downloading nvidia-nvjitlink-cu12
|
| 4152 |
-
Downloading nvidia-curand-cu12
|
| 4153 |
-
Downloading nvidia-cuda-nvrtc-cu12
|
| 4154 |
-
Downloading triton
|
| 4155 |
-
Downloading nvidia-cufft-cu12
|
| 4156 |
-
Downloading nvidia-cusolver-cu12
|
| 4157 |
-
Downloading nvidia-cusparse-cu12
|
| 4158 |
-
Downloading nvidia-cusparselt-cu12
|
| 4159 |
-
Downloading nvidia-nccl-cu12
|
| 4160 |
-
Downloading nvidia-cublas-cu12
|
| 4161 |
-
Downloading nvidia-cudnn-cu12
|
| 4162 |
-
Downloading torch
|
| 4163 |
-
Installed 37 packages in 216ms
|
| 4164 |
-
</div>
|
| 4165 |
-
</div>
|
| 4166 |
<div class="cell-artifacts">
|
| 4167 |
<h4>Artifacts:</h4>
|
| 4168 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
|
|
|
| 3869 |
<span class="collapse-indicators">
|
| 3870 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: benchmark | 3.92s
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3924 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3925 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3926 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3927 |
+
torch_mem_eff 4.77% 333.269us 32.71% 2.284ms 2.284ms 0.000us 0.00% 5.420ms 5.420ms 1
|
| 3928 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.402ms 100.61% 5.402ms 5.402ms 1
|
| 3929 |
+
aten::scaled_dot_product_attention 0.44% 30.450us 2.54% 177.435us 59.145us 0.000us 0.00% 4.753ms 1.584ms 3
|
| 3930 |
+
aten::_scaled_dot_product_efficient_attention 0.33% 22.722us 2.10% 146.985us 48.995us 0.000us 0.00% 4.753ms 1.584ms 3
|
| 3931 |
+
aten::_efficient_attention_forward 0.51% 35.382us 1.42% 99.273us 33.091us 4.753ms 88.51% 4.753ms 1.584ms 3
|
| 3932 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.753ms 88.51% 4.753ms 1.584ms 3
|
| 3933 |
+
aten::contiguous 0.17% 11.660us 24.51% 1.712ms 190.185us 0.000us 0.00% 667.266us 74.141us 9
|
| 3934 |
+
aten::clone 0.46% 31.810us 24.34% 1.700ms 188.889us 0.000us 0.00% 667.266us 74.141us 9
|
| 3935 |
+
aten::copy_ 1.01% 70.871us 22.86% 1.597ms 177.404us 616.738us 11.49% 667.266us 74.141us 9
|
| 3936 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 616.738us 11.49% 616.738us 68.526us 9
|
| 3937 |
+
Activity Buffer Request 20.64% 1.441ms 20.64% 1.441ms 1.441ms 50.528us 0.94% 50.528us 50.528us 1
|
| 3938 |
+
aten::transpose 0.91% 63.619us 1.25% 87.011us 3.625us 0.000us 0.00% 0.000us 0.000us 24
|
| 3939 |
+
aten::as_strided 0.33% 23.392us 0.33% 23.392us 0.975us 0.000us 0.00% 0.000us 0.000us 24
|
| 3940 |
+
aten::empty_like 0.24% 16.972us 1.02% 71.553us 7.950us 0.000us 0.00% 0.000us 0.000us 9
|
| 3941 |
+
aten::empty 1.18% 82.691us 1.18% 82.691us 3.938us 0.000us 0.00% 0.000us 0.000us 21
|
| 3942 |
+
cudaLaunchKernel 1.55% 108.383us 1.55% 108.383us 9.032us 0.000us 0.00% 0.000us 0.000us 12
|
| 3943 |
+
cudaStreamIsCapturing 0.05% 3.260us 0.05% 3.260us 1.087us 0.000us 0.00% 0.000us 0.000us 3
|
| 3944 |
+
cudaFuncSetAttribute 0.12% 8.450us 0.12% 8.450us 2.817us 0.000us 0.00% 0.000us 0.000us 3
|
| 3945 |
+
cudaDeviceSynchronize 67.29% 4.700ms 67.29% 4.700ms 4.700ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3946 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3947 |
+
Self CPU time total: 6.984ms
|
| 3948 |
+
Self CUDA time total: 5.369ms
|
| 3949 |
|
| 3950 |
|
| 3951 |
|
|
|
|
| 3955 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3956 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3957 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3958 |
+
torch_mem_eff 3.53% 251.015us 29.52% 2.098ms 2.098ms 0.000us 0.00% 5.633ms 5.633ms 1
|
| 3959 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.587ms 100.15% 5.587ms 5.587ms 1
|
| 3960 |
+
aten::scaled_dot_product_attention 0.25% 17.630us 2.05% 145.594us 48.531us 0.000us 0.00% 4.943ms 1.648ms 3
|
| 3961 |
+
aten::_scaled_dot_product_efficient_attention 0.28% 19.810us 1.80% 127.964us 42.655us 0.000us 0.00% 4.943ms 1.648ms 3
|
| 3962 |
+
aten::_efficient_attention_forward 0.42% 29.862us 1.18% 83.512us 27.837us 4.943ms 88.61% 4.943ms 1.648ms 3
|
| 3963 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.943ms 88.61% 4.943ms 1.648ms 3
|
| 3964 |
+
aten::contiguous 0.10% 7.191us 23.30% 1.656ms 184.002us 0.000us 0.00% 689.540us 76.616us 9
|
| 3965 |
+
aten::clone 0.33% 23.318us 23.20% 1.649ms 183.203us 0.000us 0.00% 689.540us 76.616us 9
|
| 3966 |
+
aten::copy_ 0.92% 65.725us 22.12% 1.572ms 174.717us 635.140us 11.39% 689.540us 76.616us 9
|
| 3967 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 635.140us 11.39% 635.140us 70.571us 9
|
| 3968 |
+
Activity Buffer Request 20.24% 1.439ms 20.24% 1.439ms 1.439ms 54.400us 0.98% 54.400us 54.400us 1
|
| 3969 |
+
aten::transpose 0.71% 50.494us 0.99% 70.123us 2.922us 0.000us 0.00% 0.000us 0.000us 24
|
| 3970 |
+
aten::as_strided 0.28% 19.629us 0.28% 19.629us 0.818us 0.000us 0.00% 0.000us 0.000us 24
|
| 3971 |
+
aten::empty_like 0.18% 12.608us 0.75% 53.061us 5.896us 0.000us 0.00% 0.000us 0.000us 9
|
| 3972 |
+
aten::empty 0.94% 66.903us 0.94% 66.903us 3.186us 0.000us 0.00% 0.000us 0.000us 21
|
| 3973 |
+
cudaLaunchKernel 1.25% 89.012us 1.25% 89.012us 7.418us 0.000us 0.00% 0.000us 0.000us 12
|
| 3974 |
+
cudaStreamIsCapturing 0.03% 2.220us 0.03% 2.220us 0.740us 0.000us 0.00% 0.000us 0.000us 3
|
| 3975 |
+
cudaFuncSetAttribute 0.05% 3.880us 0.05% 3.880us 1.293us 0.000us 0.00% 0.000us 0.000us 3
|
| 3976 |
+
cudaDeviceSynchronize 70.48% 5.009ms 70.48% 5.009ms 5.009ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3977 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3978 |
+
Self CPU time total: 7.107ms
|
| 3979 |
+
Self CUDA time total: 5.578ms
|
| 3980 |
|
| 3981 |
|
| 3982 |
|
|
|
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3988 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3989 |
+
torch_mem_eff 3.28% 246.598us 28.54% 2.146ms 2.146ms 0.000us 0.00% 6.014ms 6.014ms 1
|
| 3990 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.967ms 100.18% 5.967ms 5.967ms 1
|
| 3991 |
+
aten::scaled_dot_product_attention 0.24% 18.181us 1.92% 144.583us 48.194us 0.000us 0.00% 5.302ms 1.767ms 3
|
| 3992 |
+
aten::_scaled_dot_product_efficient_attention 0.27% 19.980us 1.68% 126.402us 42.134us 0.000us 0.00% 5.302ms 1.767ms 3
|
| 3993 |
+
aten::_efficient_attention_forward 0.38% 28.571us 1.10% 82.521us 27.507us 5.302ms 89.01% 5.302ms 1.767ms 3
|
| 3994 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.302ms 89.01% 5.302ms 1.767ms 3
|
| 3995 |
+
aten::contiguous 0.09% 6.930us 22.70% 1.707ms 189.666us 0.000us 0.00% 712.547us 79.172us 9
|
| 3996 |
+
aten::clone 0.30% 22.691us 22.61% 1.700ms 188.896us 0.000us 0.00% 712.547us 79.172us 9
|
| 3997 |
+
aten::copy_ 1.08% 81.024us 21.57% 1.622ms 180.228us 654.403us 10.99% 712.547us 79.172us 9
|
| 3998 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.403us 10.99% 654.403us 72.711us 9
|
| 3999 |
+
Activity Buffer Request 19.57% 1.471ms 19.57% 1.471ms 1.471ms 58.144us 0.98% 58.144us 58.144us 1
|
| 4000 |
+
aten::transpose 0.68% 51.431us 0.95% 71.351us 2.973us 0.000us 0.00% 0.000us 0.000us 24
|
| 4001 |
+
aten::as_strided 0.26% 19.920us 0.26% 19.920us 0.830us 0.000us 0.00% 0.000us 0.000us 24
|
| 4002 |
+
aten::empty_like 0.16% 11.979us 0.74% 55.320us 6.147us 0.000us 0.00% 0.000us 0.000us 9
|
| 4003 |
+
aten::empty 0.93% 69.561us 0.93% 69.561us 3.312us 0.000us 0.00% 0.000us 0.000us 21
|
| 4004 |
+
cudaLaunchKernel 1.22% 91.652us 1.22% 91.652us 7.638us 0.000us 0.00% 0.000us 0.000us 12
|
| 4005 |
+
cudaStreamIsCapturing 0.03% 2.359us 0.03% 2.359us 0.786us 0.000us 0.00% 0.000us 0.000us 3
|
| 4006 |
+
cudaFuncSetAttribute 0.05% 3.430us 0.05% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3
|
| 4007 |
+
cudaDeviceSynchronize 71.46% 5.373ms 71.46% 5.373ms 5.373ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4008 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4009 |
+
Self CPU time total: 7.519ms
|
| 4010 |
+
Self CUDA time total: 5.956ms
|
| 4011 |
|
| 4012 |
|
| 4013 |
|
|
|
|
| 4017 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4018 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4019 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4020 |
+
torch_mem_eff 3.21% 251.576us 29.97% 2.347ms 2.347ms 0.000us 0.00% 6.116ms 6.116ms 1
|
| 4021 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.068ms 100.14% 6.068ms 6.068ms 1
|
| 4022 |
+
aten::scaled_dot_product_attention 0.24% 18.800us 1.87% 146.693us 48.898us 0.000us 0.00% 5.408ms 1.803ms 3
|
| 4023 |
+
aten::_scaled_dot_product_efficient_attention 0.25% 19.900us 1.63% 127.893us 42.631us 0.000us 0.00% 5.408ms 1.803ms 3
|
| 4024 |
+
aten::_efficient_attention_forward 0.38% 29.372us 1.07% 83.903us 27.968us 5.408ms 89.25% 5.408ms 1.803ms 3
|
| 4025 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.408ms 89.25% 5.408ms 1.803ms 3
|
| 4026 |
+
aten::contiguous 0.10% 7.511us 24.29% 1.902ms 211.340us 0.000us 0.00% 708.735us 78.748us 9
|
| 4027 |
+
aten::clone 0.28% 21.872us 24.19% 1.895ms 210.505us 0.000us 0.00% 708.735us 78.748us 9
|
| 4028 |
+
aten::copy_ 0.85% 66.540us 23.20% 1.817ms 201.834us 651.551us 10.75% 708.735us 78.748us 9
|
| 4029 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 651.551us 10.75% 651.551us 72.395us 9
|
| 4030 |
+
Activity Buffer Request 18.68% 1.462ms 18.68% 1.462ms 1.462ms 57.184us 0.94% 57.184us 57.184us 1
|
| 4031 |
+
aten::transpose 0.65% 50.781us 0.90% 70.402us 2.933us 0.000us 0.00% 0.000us 0.000us 24
|
| 4032 |
+
aten::as_strided 0.25% 19.621us 0.25% 19.621us 0.818us 0.000us 0.00% 0.000us 0.000us 24
|
| 4033 |
+
aten::empty_like 0.15% 11.809us 0.72% 56.170us 6.241us 0.000us 0.00% 0.000us 0.000us 9
|
| 4034 |
+
aten::empty 0.90% 70.242us 0.90% 70.242us 3.345us 0.000us 0.00% 0.000us 0.000us 21
|
| 4035 |
+
cudaLaunchKernel 3.97% 310.797us 3.97% 310.797us 25.900us 0.000us 0.00% 0.000us 0.000us 12
|
| 4036 |
+
cudaStreamIsCapturing 0.03% 2.250us 0.03% 2.250us 0.750us 0.000us 0.00% 0.000us 0.000us 3
|
| 4037 |
+
cudaFuncSetAttribute 0.04% 3.220us 0.04% 3.220us 1.073us 0.000us 0.00% 0.000us 0.000us 3
|
| 4038 |
+
cudaDeviceSynchronize 70.03% 5.484ms 70.03% 5.484ms 5.484ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4039 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4040 |
+
Self CPU time total: 7.830ms
|
| 4041 |
+
Self CUDA time total: 6.059ms
|
| 4042 |
|
| 4043 |
|
| 4044 |
|
|
|
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4050 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4051 |
+
torch_mem_eff 3.15% 250.575us 28.50% 2.270ms 2.270ms 0.000us 0.00% 6.322ms 6.322ms 1
|
| 4052 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.270ms 100.14% 6.270ms 6.270ms 1
|
| 4053 |
+
aten::scaled_dot_product_attention 0.22% 17.572us 1.82% 145.084us 48.361us 0.000us 0.00% 5.598ms 1.866ms 3
|
| 4054 |
+
aten::_scaled_dot_product_efficient_attention 0.24% 19.250us 1.60% 127.512us 42.504us 0.000us 0.00% 5.598ms 1.866ms 3
|
| 4055 |
+
aten::_efficient_attention_forward 0.36% 28.812us 1.05% 83.962us 27.987us 5.598ms 89.40% 5.598ms 1.866ms 3
|
| 4056 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.598ms 89.40% 5.598ms 1.866ms 3
|
| 4057 |
+
aten::contiguous 0.09% 6.912us 22.94% 1.827ms 203.045us 0.000us 0.00% 724.000us 80.444us 9
|
| 4058 |
+
aten::clone 0.28% 21.949us 22.86% 1.820ms 202.277us 0.000us 0.00% 724.000us 80.444us 9
|
| 4059 |
+
aten::copy_ 0.82% 65.091us 21.89% 1.744ms 193.745us 664.032us 10.60% 724.000us 80.444us 9
|
| 4060 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 664.032us 10.60% 664.032us 73.781us 9
|
| 4061 |
+
Activity Buffer Request 18.02% 1.435ms 18.02% 1.435ms 1.435ms 59.968us 0.96% 59.968us 59.968us 1
|
| 4062 |
+
aten::transpose 0.64% 50.930us 0.89% 70.859us 2.952us 0.000us 0.00% 0.000us 0.000us 24
|
| 4063 |
+
aten::as_strided 0.25% 19.929us 0.25% 19.929us 0.830us 0.000us 0.00% 0.000us 0.000us 24
|
| 4064 |
+
aten::empty_like 0.15% 12.022us 0.69% 54.843us 6.094us 0.000us 0.00% 0.000us 0.000us 9
|
| 4065 |
+
aten::empty 0.87% 69.430us 0.87% 69.430us 3.306us 0.000us 0.00% 0.000us 0.000us 21
|
| 4066 |
+
cudaLaunchKernel 3.34% 266.388us 3.34% 266.388us 22.199us 0.000us 0.00% 0.000us 0.000us 12
|
| 4067 |
+
cudaStreamIsCapturing 0.03% 2.320us 0.03% 2.320us 0.773us 0.000us 0.00% 0.000us 0.000us 3
|
| 4068 |
+
cudaFuncSetAttribute 0.04% 3.120us 0.04% 3.120us 1.040us 0.000us 0.00% 0.000us 0.000us 3
|
| 4069 |
+
cudaDeviceSynchronize 71.50% 5.695ms 71.50% 5.695ms 5.695ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4070 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4071 |
+
Self CPU time total: 7.965ms
|
| 4072 |
+
Self CUDA time total: 6.262ms
|
| 4073 |
|
| 4074 |
|
| 4075 |
|
|
|
|
| 4079 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4080 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4081 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4082 |
+
torch_mem_eff 3.00% 248.403us 26.98% 2.232ms 2.232ms 0.000us 0.00% 6.668ms 6.668ms 1
|
| 4083 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.616ms 100.13% 6.616ms 6.616ms 1
|
| 4084 |
+
aten::scaled_dot_product_attention 0.21% 17.221us 1.72% 142.654us 47.551us 0.000us 0.00% 5.939ms 1.980ms 3
|
| 4085 |
+
aten::_scaled_dot_product_efficient_attention 0.23% 18.779us 1.52% 125.433us 41.811us 0.000us 0.00% 5.939ms 1.980ms 3
|
| 4086 |
+
aten::_efficient_attention_forward 0.34% 28.440us 0.99% 81.712us 27.237us 5.939ms 89.88% 5.939ms 1.980ms 3
|
| 4087 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.939ms 89.88% 5.939ms 1.980ms 3
|
| 4088 |
+
aten::contiguous 0.08% 6.861us 21.66% 1.792ms 199.142us 0.000us 0.00% 729.440us 81.049us 9
|
| 4089 |
+
aten::clone 0.26% 21.352us 21.58% 1.785ms 198.379us 0.000us 0.00% 729.440us 81.049us 9
|
| 4090 |
+
aten::copy_ 0.83% 69.012us 20.65% 1.709ms 189.858us 668.928us 10.12% 729.440us 81.049us 9
|
| 4091 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 668.928us 10.12% 668.928us 74.325us 9
|
| 4092 |
+
Activity Buffer Request 17.29% 1.430ms 17.29% 1.430ms 1.430ms 60.512us 0.92% 60.512us 60.512us 1
|
| 4093 |
+
aten::transpose 0.63% 51.780us 0.89% 73.784us 3.074us 0.000us 0.00% 0.000us 0.000us 24
|
| 4094 |
+
aten::as_strided 0.27% 22.004us 0.27% 22.004us 0.917us 0.000us 0.00% 0.000us 0.000us 24
|
| 4095 |
+
aten::empty_like 0.14% 11.870us 0.67% 55.340us 6.149us 0.000us 0.00% 0.000us 0.000us 9
|
| 4096 |
+
aten::empty 0.84% 69.312us 0.84% 69.312us 3.301us 0.000us 0.00% 0.000us 0.000us 21
|
| 4097 |
+
cudaLaunchKernel 2.79% 231.145us 2.79% 231.145us 19.262us 0.000us 0.00% 0.000us 0.000us 12
|
| 4098 |
+
cudaStreamIsCapturing 0.03% 2.280us 0.03% 2.280us 0.760us 0.000us 0.00% 0.000us 0.000us 3
|
| 4099 |
+
cudaFuncSetAttribute 0.04% 3.570us 0.04% 3.570us 1.190us 0.000us 0.00% 0.000us 0.000us 3
|
| 4100 |
+
cudaDeviceSynchronize 73.02% 6.041ms 73.02% 6.041ms 6.041ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4101 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4102 |
+
Self CPU time total: 8.273ms
|
| 4103 |
+
Self CUDA time total: 6.608ms
|
| 4104 |
|
| 4105 |
|
| 4106 |
impl wl p50(ms) ok
|
| 4107 |
+
torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
|
| 4108 |
+
torch_mem_eff cuda_attn_L256_bfloat16 1.89 True
|
| 4109 |
+
torch_mem_eff cuda_attn_L320_bfloat16 2.00 True
|
| 4110 |
+
torch_mem_eff cuda_attn_L384_bfloat16 1.97 True
|
| 4111 |
+
torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
|
| 4112 |
torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
|
| 4113 |
</pre></div>
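The tables above use the layout emitted by torch.profiler's key_averages().table(). A minimal sketch of producing one such trace follows, assuming `run_attention` is a hypothetical stand-in for the benchmarked callable (the real harness is cells/benchmark.py):

    import torch
    from torch.profiler import profile, ProfilerActivity, record_function

    def trace(run_attention, reps=3):
        # Profile CPU and CUDA activity; the record_function label becomes
        # the top-level row of the table (e.g. "torch_mem_eff" above).
        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
            with record_function("torch_mem_eff"):
                for _ in range(reps):
                    run_attention()
            torch.cuda.synchronize()  # appears as the cudaDeviceSynchronize row
        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))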
|
| 4114 |
<div class="cell-artifacts">
|
| 4115 |
<h4>Artifacts:</h4>
|
| 4116 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
flash_attn/impls/sage_attention.html
CHANGED
|
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3869 |
<span class="collapse-indicators">
|
| 3870 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
-
<span id="uv-indicator-benchmark" style="cursor:
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: benchmark | 4.
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3920,23 +3920,28 @@ Cell: benchmark | 4.22s
|
|
| 3920 |
<div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
|
| 3921 |
impl wl p50(ms) ok
|
| 3922 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 3923 |
-
Error: module '
|
| 3924 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 3925 |
-
Error: module '
|
| 3926 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 3927 |
-
Error: module '
|
| 3928 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 3929 |
-
Error: module '
|
| 3930 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 3931 |
-
Error: module '
|
| 3932 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 3933 |
-
Error: module '
|
| 3934 |
</pre></div>
|
| 3935 |
-
<div class="
|
| 3936 |
-
|
| 3937 |
-
|
| 3938 |
-
|
| 3939 |
</div>
|
| 3940 |
<div class="cell-artifacts">
|
| 3941 |
<h4>Artifacts:</h4>
|
| 3942 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
|
|
|
| 3869 |
<span class="collapse-indicators">
|
| 3870 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
+
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: benchmark | 4.53s
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3920 |
<div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
|
| 3921 |
impl wl p50(ms) ok
|
| 3922 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 3923 |
+
Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
|
| 3924 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 3925 |
+
Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
|
| 3926 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 3927 |
+
Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
|
| 3928 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 3929 |
+
Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
|
| 3930 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 3931 |
+
Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
|
| 3932 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 3933 |
+
Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
|
| 3934 |
</pre></div>
|
| 3935 |
+
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3936 |
+
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3937 |
+
<div class="uv-logs-content" style="display: none;">
|
| 3938 |
+
Installed 15 packages in 14ms
|
| 3939 |
</div>
|
| 3940 |
+
</div>
|
| 3941 |
+
<div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
|
| 3942 |
+
Fetching 11 files: 18%|█▊ | 2/11 [00:00<00:00, 15.79it/s]
|
| 3943 |
+
Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 13.55it/s]
|
| 3944 |
+
Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 18.83it/s]</div>
|
| 3945 |
<div class="cell-artifacts">
|
| 3946 |
<h4>Artifacts:</h4>
|
| 3947 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
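The sage_int8_fp16 failures above all share one cause: the loaded kernel module exposes no `fwd` attribute, and the same error repeats once per workload. A defensive lookup at load time would surface the mismatch once, together with the names the module actually provides. A minimal sketch, assuming `kernel_module` is whatever object the kernel loader returned (the hashed module name is generated at runtime, and "forward" is only a guessed alternate entry point):

    def resolve_forward(kernel_module, candidates=("fwd", "forward")):
        # Probe the expected entry points and fail fast with the module's
        # actual public attributes, instead of erroring per workload.
        for name in candidates:
            fn = getattr(kernel_module, name, None)
            if callable(fn):
                return fn
        public = [a for a in dir(kernel_module) if not a.startswith("_")]
        raise AttributeError(
            f"{kernel_module.__name__!r} exposes none of {candidates}; "
            f"available: {public}"
        )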
flash_attn/impls/xformers.html
CHANGED
|
@@ -3923,21 +3923,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
|
|
| 3923 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3924 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3925 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3926 |
-
xformers_meff
|
| 3927 |
-
xformers_flash3::flash_fwd 4.
|
| 3928 |
-
flash_attn_3::fwd 1.
|
| 3929 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3930 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3931 |
-
Activity Buffer Request 31.
|
| 3932 |
-
aten::empty 0.
|
| 3933 |
-
cudaFuncSetAttribute 0.
|
| 3934 |
-
cudaLaunchKernel 0.
|
| 3935 |
-
aten::reshape 0.
|
| 3936 |
-
aten::view 0.
|
| 3937 |
-
cudaDeviceSynchronize
|
| 3938 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3939 |
-
Self CPU time total: 4.
|
| 3940 |
-
Self CUDA time total: 2.
|
| 3941 |
|
| 3942 |
|
| 3943 |
|
|
@@ -3947,21 +3947,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
|
|
| 3947 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3948 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3949 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3950 |
-
xformers_meff
|
| 3951 |
-
xformers_flash3::flash_fwd 3.
|
| 3952 |
-
flash_attn_3::fwd 1.
|
| 3953 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3954 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3955 |
-
Activity Buffer Request
|
| 3956 |
-
aten::empty 0.
|
| 3957 |
-
cudaFuncSetAttribute 0.12% 5.
|
| 3958 |
-
cudaLaunchKernel 0.
|
| 3959 |
-
aten::reshape 0.
|
| 3960 |
-
aten::view 0.
|
| 3961 |
-
cudaDeviceSynchronize
|
| 3962 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3963 |
-
Self CPU time total: 4.
|
| 3964 |
-
Self CUDA time total: 2.
|
| 3965 |
|
| 3966 |
|
| 3967 |
|
|
@@ -3971,21 +3971,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
|
|
| 3971 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3972 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3973 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3974 |
-
xformers_meff 6.
|
| 3975 |
-
xformers_flash3::flash_fwd 3.
|
| 3976 |
-
flash_attn_3::fwd 1.
|
| 3977 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3978 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3979 |
-
Activity Buffer Request 31.
|
| 3980 |
-
aten::empty 0.
|
| 3981 |
-
cudaFuncSetAttribute 0.12% 5.
|
| 3982 |
-
cudaLaunchKernel 0.
|
| 3983 |
-
aten::reshape 0.
|
| 3984 |
-
aten::view 0.31%
|
| 3985 |
-
cudaDeviceSynchronize 55.
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
-
Self CPU time total: 4.
|
| 3988 |
-
Self CUDA time total: 2.
|
| 3989 |
|
| 3990 |
|
| 3991 |
|
|
@@ -3995,21 +3995,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
|
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3997 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3998 |
-
xformers_meff 6.
|
| 3999 |
-
xformers_flash3::flash_fwd
|
| 4000 |
-
flash_attn_3::fwd 1.
|
| 4001 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4002 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4003 |
-
Activity Buffer Request 30.
|
| 4004 |
-
aten::empty 0.
|
| 4005 |
-
cudaFuncSetAttribute 0.11% 5.
|
| 4006 |
-
cudaLaunchKernel
|
| 4007 |
-
aten::reshape 0.
|
| 4008 |
-
aten::view 0.
|
| 4009 |
-
cudaDeviceSynchronize
|
| 4010 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4011 |
-
Self CPU time total: 4.
|
| 4012 |
-
Self CUDA time total: 2.
|
| 4013 |
|
| 4014 |
|
| 4015 |
|
|
@@ -4019,21 +4019,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
|
|
| 4019 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4020 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
-
xformers_meff 5.
|
| 4023 |
-
xformers_flash3::flash_fwd 2.
|
| 4024 |
-
flash_attn_3::fwd 1.
|
| 4025 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4026 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4027 |
-
Activity Buffer Request 27.
|
| 4028 |
-
aten::empty 0.
|
| 4029 |
-
cudaFuncSetAttribute 0.11% 5.
|
| 4030 |
-
cudaLaunchKernel 3.
|
| 4031 |
-
aten::reshape 0.
|
| 4032 |
-
aten::view 0.
|
| 4033 |
-
cudaDeviceSynchronize 58.
|
| 4034 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4035 |
-
Self CPU time total: 5.
|
| 4036 |
-
Self CUDA time total: 3.
|
| 4037 |
|
| 4038 |
|
| 4039 |
|
|
@@ -4043,37 +4043,37 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
|
|
| 4043 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4044 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4045 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4046 |
-
xformers_meff 5.
|
| 4047 |
-
xformers_flash3::flash_fwd 2.
|
| 4048 |
-
flash_attn_3::fwd 1.
|
| 4049 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4050 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4051 |
-
Activity Buffer Request 27.
|
| 4052 |
-
aten::empty 0.
|
| 4053 |
-
cudaFuncSetAttribute 0.
|
| 4054 |
-
cudaLaunchKernel 3.
|
| 4055 |
-
aten::reshape 0.17% 8.
|
| 4056 |
-
aten::view 0.
|
| 4057 |
-
cudaDeviceSynchronize 58.
|
| 4058 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4059 |
-
Self CPU time total: 5.
|
| 4060 |
-
Self CUDA time total: 3.
|
| 4061 |
|
| 4062 |
|
| 4063 |
impl wl p50(ms) ok
|
| 4064 |
-
xformers_meff cuda_attn_L128_bfloat16
|
| 4065 |
xformers_meff cuda_attn_L256_bfloat16 1.03 True
|
| 4066 |
xformers_meff cuda_attn_L320_bfloat16 1.08 True
|
| 4067 |
-
xformers_meff cuda_attn_L384_bfloat16 1.
|
| 4068 |
-
xformers_meff cuda_attn_L448_bfloat16 1.
|
| 4069 |
-
xformers_meff cuda_attn_L512_bfloat16 1.
|
| 4070 |
</pre></div>
|
| 4071 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4072 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4073 |
<div class="uv-logs-content" style="display: none;">
|
| 4074 |
Downloading xformers (111.8MiB)
|
| 4075 |
Downloading xformers
|
| 4076 |
-
Installed 1 package in
|
| 4077 |
</div>
|
| 4078 |
</div>
|
| 4079 |
<div class="cell-artifacts">
|
|
|
|
| 3923 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3924 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3925 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3926 |
+
xformers_meff 10.85% 481.112us 51.55% 2.285ms 2.285ms 0.000us 0.00% 3.582ms 3.582ms 1
|
| 3927 |
+
xformers_flash3::flash_fwd 4.56% 202.185us 39.85% 1.766ms 588.715us 0.000us 0.00% 3.582ms 1.194ms 3
|
| 3928 |
+
flash_attn_3::fwd 1.68% 74.662us 35.29% 1.564ms 521.320us 2.681ms 100.00% 3.582ms 1.194ms 3
|
| 3929 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.682ms 100.06% 2.682ms 2.682ms 1
|
| 3930 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.681ms 100.00% 2.681ms 893.515us 3
|
| 3931 |
+
Activity Buffer Request 31.74% 1.407ms 31.74% 1.407ms 1.407ms 901.761us 33.64% 901.761us 901.761us 1
|
| 3932 |
+
aten::empty 0.77% 33.920us 0.77% 33.920us 5.653us 0.000us 0.00% 0.000us 0.000us 6
|
| 3933 |
+
cudaFuncSetAttribute 0.23% 10.152us 0.23% 10.152us 3.384us 0.000us 0.00% 0.000us 0.000us 3
|
| 3934 |
+
cudaLaunchKernel 0.87% 38.521us 0.87% 38.521us 12.840us 0.000us 0.00% 0.000us 0.000us 3
|
| 3935 |
+
aten::reshape 0.29% 13.028us 0.85% 37.710us 6.285us 0.000us 0.00% 0.000us 0.000us 6
|
| 3936 |
+
aten::view 0.56% 24.682us 0.56% 24.682us 4.114us 0.000us 0.00% 0.000us 0.000us 6
|
| 3937 |
+
cudaDeviceSynchronize 48.45% 2.147ms 48.45% 2.147ms 2.147ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3938 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3939 |
+
Self CPU time total: 4.432ms
|
| 3940 |
+
Self CUDA time total: 2.681ms
|
| 3941 |
|
| 3942 |
|
| 3943 |
|
|
|
|
| 3947 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3948 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3949 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3950 |
+
xformers_meff 7.16% 317.438us 45.96% 2.036ms 2.036ms 0.000us 0.00% 3.779ms 3.779ms 1
|
| 3951 |
+
xformers_flash3::flash_fwd 3.35% 148.243us 38.25% 1.695ms 564.991us 0.000us 0.00% 3.779ms 1.260ms 3
|
| 3952 |
+
flash_attn_3::fwd 1.25% 55.403us 34.91% 1.547ms 515.576us 2.825ms 100.00% 3.779ms 1.260ms 3
|
| 3953 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.827ms 100.05% 2.827ms 2.827ms 1
|
| 3954 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.825ms 100.00% 2.825ms 941.739us 3
|
| 3955 |
+
Activity Buffer Request 32.14% 1.424ms 32.14% 1.424ms 1.424ms 954.080us 33.77% 954.080us 954.080us 1
|
| 3956 |
+
aten::empty 0.63% 27.720us 0.63% 27.720us 4.620us 0.000us 0.00% 0.000us 0.000us 6
|
| 3957 |
+
cudaFuncSetAttribute 0.12% 5.400us 0.12% 5.400us 1.800us 0.000us 0.00% 0.000us 0.000us 3
|
| 3958 |
+
cudaLaunchKernel 0.77% 34.161us 0.77% 34.161us 11.387us 0.000us 0.00% 0.000us 0.000us 3
|
| 3959 |
+
aten::reshape 0.21% 9.370us 0.54% 23.750us 3.958us 0.000us 0.00% 0.000us 0.000us 6
|
| 3960 |
+
aten::view 0.32% 14.380us 0.32% 14.380us 2.397us 0.000us 0.00% 0.000us 0.000us 6
|
| 3961 |
+
cudaDeviceSynchronize 54.04% 2.395ms 54.04% 2.395ms 2.395ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3962 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3963 |
+
Self CPU time total: 4.431ms
|
| 3964 |
+
Self CUDA time total: 2.825ms
|
| 3965 |
|
| 3966 |
|
| 3967 |
|
|
|
|
| 3971 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3972 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3973 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3974 |
+
xformers_meff 6.87% 310.027us 44.72% 2.018ms 2.018ms 0.000us 0.00% 3.923ms 3.923ms 1
|
| 3975 |
+
xformers_flash3::flash_fwd 3.22% 145.444us 37.33% 1.684ms 561.324us 0.000us 0.00% 3.923ms 1.308ms 3
|
| 3976 |
+
flash_attn_3::fwd 1.15% 52.002us 34.10% 1.539ms 512.843us 2.919ms 100.00% 3.923ms 1.308ms 3
|
| 3977 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.921ms 100.06% 2.921ms 2.921ms 1
|
| 3978 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.919ms 100.00% 2.919ms 973.037us 3
|
| 3979 |
+
Activity Buffer Request 31.44% 1.418ms 31.44% 1.418ms 1.418ms 1.004ms 34.40% 1.004ms 1.004ms 1
|
| 3980 |
+
aten::empty 0.63% 28.392us 0.63% 28.392us 4.732us 0.000us 0.00% 0.000us 0.000us 6
|
| 3981 |
+
cudaFuncSetAttribute 0.12% 5.520us 0.12% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3
|
| 3982 |
+
cudaLaunchKernel 0.76% 34.420us 0.76% 34.420us 11.473us 0.000us 0.00% 0.000us 0.000us 3
|
| 3983 |
+
aten::reshape 0.21% 9.519us 0.52% 23.650us 3.942us 0.000us 0.00% 0.000us 0.000us 6
|
| 3984 |
+
aten::view 0.31% 14.131us 0.31% 14.131us 2.355us 0.000us 0.00% 0.000us 0.000us 6
|
| 3985 |
+
cudaDeviceSynchronize 55.28% 2.494ms 55.28% 2.494ms 2.494ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
+
Self CPU time total: 4.511ms
|
| 3988 |
+
Self CUDA time total: 2.919ms
|
| 3989 |
|
| 3990 |
|
| 3991 |
|
|
|
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3997 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3998 |
+
xformers_meff 6.73% 317.798us 47.46% 2.241ms 2.241ms 0.000us 0.00% 3.892ms 3.892ms 1
|
| 3999 |
+
xformers_flash3::flash_fwd 3.10% 146.544us 40.23% 1.900ms 633.169us 0.000us 0.00% 3.892ms 1.297ms 3
|
| 4000 |
+
flash_attn_3::fwd 1.15% 54.462us 37.13% 1.753ms 584.321us 2.910ms 100.00% 3.892ms 1.297ms 3
|
| 4001 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.911ms 100.05% 2.911ms 2.911ms 1
|
| 4002 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.910ms 100.00% 2.910ms 969.848us 3
|
| 4003 |
+
Activity Buffer Request 30.01% 1.417ms 30.01% 1.417ms 1.417ms 982.915us 33.78% 982.915us 982.915us 1
|
| 4004 |
+
aten::empty 0.62% 29.170us 0.62% 29.170us 4.862us 0.000us 0.00% 0.000us 0.000us 6
|
| 4005 |
+
cudaFuncSetAttribute 0.11% 5.370us 0.11% 5.370us 1.790us 0.000us 0.00% 0.000us 0.000us 3
|
| 4006 |
+
cudaLaunchKernel 5.23% 247.156us 5.23% 247.156us 82.385us 0.000us 0.00% 0.000us 0.000us 3
|
| 4007 |
+
aten::reshape 0.20% 9.560us 0.50% 23.460us 3.910us 0.000us 0.00% 0.000us 0.000us 6
|
| 4008 |
+
aten::view 0.29% 13.900us 0.29% 13.900us 2.317us 0.000us 0.00% 0.000us 0.000us 6
|
| 4009 |
+
cudaDeviceSynchronize 52.54% 2.481ms 52.54% 2.481ms 2.481ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4010 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4011 |
+
Self CPU time total: 4.721ms
|
| 4012 |
+
Self CUDA time total: 2.910ms
|
| 4013 |
|
| 4014 |
|
| 4015 |
|
|
|
|
| 4019 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4020 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
+
xformers_meff 5.86% 306.369us 41.94% 2.193ms 2.193ms 0.000us 0.00% 4.614ms 4.614ms 1
|
| 4023 |
+
xformers_flash3::flash_fwd 2.85% 149.202us 35.63% 1.863ms 620.885us 0.000us 0.00% 4.614ms 1.538ms 3
|
| 4024 |
+
flash_attn_3::fwd 1.03% 53.951us 32.77% 1.713ms 571.151us 3.461ms 100.00% 4.614ms 1.538ms 3
|
| 4025 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.462ms 100.04% 3.462ms 3.462ms 1
|
| 4026 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.461ms 100.00% 3.461ms 1.154ms 3
|
| 4027 |
+
Activity Buffer Request 27.28% 1.426ms 27.28% 1.426ms 1.426ms 1.153ms 33.31% 1.153ms 1.153ms 1
|
| 4028 |
+
aten::empty 0.55% 28.813us 0.55% 28.813us 4.802us 0.000us 0.00% 0.000us 0.000us 6
|
| 4029 |
+
cudaFuncSetAttribute 0.11% 5.560us 0.11% 5.560us 1.853us 0.000us 0.00% 0.000us 0.000us 3
|
| 4030 |
+
cudaLaunchKernel 3.80% 198.684us 3.80% 198.684us 66.228us 0.000us 0.00% 0.000us 0.000us 3
|
| 4031 |
+
aten::reshape 0.18% 9.430us 0.46% 23.930us 3.988us 0.000us 0.00% 0.000us 0.000us 6
|
| 4032 |
+
aten::view 0.28% 14.500us 0.28% 14.500us 2.417us 0.000us 0.00% 0.000us 0.000us 6
|
| 4033 |
+
cudaDeviceSynchronize 58.06% 3.036ms 58.06% 3.036ms 3.036ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4034 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4035 |
+
Self CPU time total: 5.228ms
|
| 4036 |
+
Self CUDA time total: 3.461ms
|
| 4037 |
|
| 4038 |
|
| 4039 |
|
|
|
|
| 4043 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4044 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4045 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4046 |
+
xformers_meff 5.96% 310.158us 41.66% 2.167ms 2.167ms 0.000us 0.00% 4.643ms 4.643ms 1
|
| 4047 |
+
xformers_flash3::flash_fwd 2.83% 146.954us 35.22% 1.832ms 610.728us 0.000us 0.00% 4.643ms 1.548ms 3
|
| 4048 |
+
flash_attn_3::fwd 1.00% 51.911us 32.40% 1.685ms 561.744us 3.464ms 100.00% 4.643ms 1.548ms 3
|
| 4049 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.465ms 100.04% 3.465ms 3.465ms 1
|
| 4050 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.464ms 100.00% 3.464ms 1.155ms 3
|
| 4051 |
+
Activity Buffer Request 27.49% 1.430ms 27.49% 1.430ms 1.430ms 1.179ms 34.05% 1.179ms 1.179ms 1
|
| 4052 |
+
aten::empty 0.54% 28.311us 0.54% 28.311us 4.719us 0.000us 0.00% 0.000us 0.000us 6
|
| 4053 |
+
cudaFuncSetAttribute 0.11% 5.750us 0.11% 5.750us 1.917us 0.000us 0.00% 0.000us 0.000us 3
|
| 4054 |
+
cudaLaunchKernel 3.25% 169.084us 3.25% 169.084us 56.361us 0.000us 0.00% 0.000us 0.000us 3
|
| 4055 |
+
aten::reshape 0.17% 8.670us 0.48% 24.720us 4.120us 0.000us 0.00% 0.000us 0.000us 6
|
| 4056 |
+
aten::view 0.31% 16.050us 0.31% 16.050us 2.675us 0.000us 0.00% 0.000us 0.000us 6
|
| 4057 |
+
cudaDeviceSynchronize 58.34% 3.035ms 58.34% 3.035ms 3.035ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4058 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4059 |
+
Self CPU time total: 5.202ms
|
| 4060 |
+
Self CUDA time total: 3.464ms
|
| 4061 |
|
| 4062 |
|
| 4063 |
impl wl p50(ms) ok
|
| 4064 |
+
xformers_meff cuda_attn_L128_bfloat16 1.00 True
|
| 4065 |
xformers_meff cuda_attn_L256_bfloat16 1.03 True
|
| 4066 |
xformers_meff cuda_attn_L320_bfloat16 1.08 True
|
| 4067 |
+
xformers_meff cuda_attn_L384_bfloat16 1.09 True
|
| 4068 |
+
xformers_meff cuda_attn_L448_bfloat16 1.25 True
|
| 4069 |
+
xformers_meff cuda_attn_L512_bfloat16 1.24 True
|
| 4070 |
</pre></div>
|
| 4071 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4072 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4073 |
<div class="uv-logs-content" style="display: none;">
|
| 4074 |
Downloading xformers (111.8MiB)
|
| 4075 |
Downloading xformers
|
| 4076 |
+
Installed 1 package in 13ms
|
| 4077 |
</div>
|
| 4078 |
</div>
|
| 4079 |
<div class="cell-artifacts">
|
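The profiler tables in the cell above follow the layout of torch.profiler's key_averages().table() output. A minimal sketch of how such a trace is captured, assuming a CUDA machine; attn, q, k, and v below are hypothetical stand-ins for the benchmarked implementation:

import torch
from torch.profiler import profile, ProfilerActivity

# Hypothetical inputs: batch 1, 8 heads, 512 tokens, head_dim 64, bfloat16 on CUDA.
q = torch.randn(1, 8, 512, 64, device="cuda", dtype=torch.bfloat16)
k = torch.randn_like(q)
v = torch.randn_like(q)

def attn(q, k, v):
    # Stand-in for the attention call being profiled above.
    return torch.nn.functional.scaled_dot_product_attention(q, k, v)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    out = attn(q, k, v)
    torch.cuda.synchronize()  # shows up as the cudaDeviceSynchronize row in the tables

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=12))

Sorting by cuda_time_total with a row limit reproduces the column set seen above (Self CPU %, Self CUDA, # of Calls, and the Self CPU / Self CUDA time totals in the footer).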
flash_attn/results/artifacts/combine/latency.svg
CHANGED
Git LFS Details
flash_attn/results/combined_results.html
CHANGED
|
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-
[old SVG metadata block; the previous <dc:date> value is truncated in the diff rendering; the updated date appears in the new version below]
|
|
@@ -3982,96 +3982,96 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-
[old y-axis grid, tick, and label markup of the latency plot; the changed coordinates are truncated in the diff rendering; the updated markup appears in the new version below]
|
|
@@ -4079,73 +4079,73 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-
[old data-series paths and point markers for torch-flash-ma, torch-mem-eff, xformers-meff, hf-kernels-flash-attn, and hf-kernels-flash-attn3; the changed coordinates are truncated in the diff rendering; the updated markup appears in the new version below]
|
|
@@ -4337,48 +4337,48 @@ Summary: 6 found, 0 skipped, 0 missing
-
COMBINED BENCHMARK SUMMARY

impl wl p50(ms) ok
[old p50 values and sage_attention error messages are truncated in the diff rendering; the updated summary table appears in the new version below]

GENERATING COMBINED VISUALIZATION
|
| 4384 |
|
|
@@ -4402,7 +4402,7 @@ Implementations included:
-
[old UV install log and artifacts block; the previous install duration is truncated in the diff rendering; the updated log appears in the new version below]
|
|
@@ -4415,7 +4415,7 @@ Installed 37 packages in 187ms
-
[old SVG metadata block; the previous <dc:date> value is truncated in the diff rendering; the updated date appears in the new version below]
|
|
@@ -4525,96 +4525,96 @@ Installed 37 packages in 187ms
-
[old y-axis grid, tick, and label markup of the latency plot; the changed coordinates are truncated in the diff rendering; the updated markup appears in the new version below]
|
|
@@ -4622,73 +4622,73 @@ Installed 37 packages in 187ms
-
[old data-series paths and point markers for torch-flash-ma, torch-mem-eff, xformers-meff, hf-kernels-flash-attn, and hf-kernels-flash-attn3; the changed coordinates are truncated in the diff rendering; the updated markup appears in the new version below]
|
|
|
|
| 3872 |
<rdf:RDF>
|
| 3873 |
<ns2:Work>
|
| 3874 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3875 |
+
<dc:date>2025-10-29T14:28:03.109695</dc:date>
|
| 3876 |
<dc:format>image/svg+xml</dc:format>
|
| 3877 |
<dc:creator>
|
| 3878 |
<ns2:Agent>
|
|
|
|
| 3982 |
<g id="matplotlib.axis_2">
|
| 3983 |
<g id="ytick_1">
|
| 3984 |
<g id="grid-y--2" class="grid grid-y">
|
| 3985 |
+
<path d="M 47.81 409.00723 L 835.361742 409.00723 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3986 |
</g>
|
| 3987 |
<g id="line2d_7">
|
| 3988 |
<defs>
|
| 3989 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3990 |
</defs>
|
| 3991 |
<g>
|
| 3992 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="409.00723" style="stroke: #000000; stroke-width: 0.8" />
|
| 3993 |
</g>
|
| 3994 |
</g>
|
| 3995 |
<g id="text_7">
|
| 3996 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="412.806448" transform="rotate(-0 40.81 412.806448)">1.0</text>
|
| 3997 |
</g>
|
| 3998 |
</g>
|
| 3999 |
<g id="ytick_2">
|
| 4000 |
<g id="grid-y--3" class="grid grid-y">
|
| 4001 |
+
<path d="M 47.81 347.973099 L 835.361742 347.973099 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4002 |
</g>
|
| 4003 |
<g id="line2d_8">
|
| 4004 |
<g>
|
| 4005 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="347.973099" style="stroke: #000000; stroke-width: 0.8" />
|
| 4006 |
</g>
|
| 4007 |
</g>
|
| 4008 |
<g id="text_8">
|
| 4009 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="351.772318" transform="rotate(-0 40.81 351.772318)">1.2</text>
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="ytick_3">
|
| 4013 |
<g id="grid-y--4" class="grid grid-y">
|
| 4014 |
+
<path d="M 47.81 286.938969 L 835.361742 286.938969 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4015 |
</g>
|
| 4016 |
<g id="line2d_9">
|
| 4017 |
<g>
|
| 4018 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="286.938969" style="stroke: #000000; stroke-width: 0.8" />
|
| 4019 |
</g>
|
| 4020 |
</g>
|
| 4021 |
<g id="text_9">
|
| 4022 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="290.738187" transform="rotate(-0 40.81 290.738187)">1.4</text>
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="ytick_4">
|
| 4026 |
<g id="grid-y--5" class="grid grid-y">
|
| 4027 |
+
<path d="M 47.81 225.904838 L 835.361742 225.904838 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4028 |
</g>
|
| 4029 |
<g id="line2d_10">
|
| 4030 |
<g>
|
| 4031 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="225.904838" style="stroke: #000000; stroke-width: 0.8" />
|
| 4032 |
</g>
|
| 4033 |
</g>
|
| 4034 |
<g id="text_10">
|
| 4035 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="229.704057" transform="rotate(-0 40.81 229.704057)">1.6</text>
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="ytick_5">
|
| 4039 |
<g id="grid-y--6" class="grid grid-y">
|
| 4040 |
+
<path d="M 47.81 164.870708 L 835.361742 164.870708 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4041 |
</g>
|
| 4042 |
<g id="line2d_11">
|
| 4043 |
<g>
|
| 4044 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="164.870708" style="stroke: #000000; stroke-width: 0.8" />
|
| 4045 |
</g>
|
| 4046 |
</g>
|
| 4047 |
<g id="text_11">
|
| 4048 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="168.669926" transform="rotate(-0 40.81 168.669926)">1.8</text>
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="ytick_6">
|
| 4052 |
<g id="grid-y--7" class="grid grid-y">
|
| 4053 |
+
<path d="M 47.81 103.836577 L 835.361742 103.836577 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4054 |
</g>
|
| 4055 |
<g id="line2d_12">
|
| 4056 |
<g>
|
| 4057 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="103.836577" style="stroke: #000000; stroke-width: 0.8" />
|
| 4058 |
</g>
|
| 4059 |
</g>
|
| 4060 |
<g id="text_12">
|
| 4061 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="107.635796" transform="rotate(-0 40.81 107.635796)">2.0</text>
|
| 4062 |
</g>
|
| 4063 |
</g>
|
| 4064 |
<g id="ytick_7">
|
| 4065 |
<g id="grid-y--8" class="grid grid-y">
|
| 4066 |
+
<path d="M 47.81 42.802447 L 835.361742 42.802447 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4067 |
</g>
|
| 4068 |
<g id="line2d_13">
|
| 4069 |
<g>
|
| 4070 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="42.802447" style="stroke: #000000; stroke-width: 0.8" />
|
| 4071 |
</g>
|
| 4072 |
</g>
|
| 4073 |
<g id="text_13">
|
| 4074 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="46.601665" transform="rotate(-0 40.81 46.601665)">2.2</text>
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="series--torch-flash-ma" class="series">
|
| 4082 |
+
<path d="M 83.607806 340.639848 L 226.799032 324.181385 L 369.990258 320.559009 L 513.181484 308.901185 L 656.37271 265.282228 L 799.563935 254.967155 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4083 |
<defs>
|
| 4084 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4085 |
</defs>
|
| 4086 |
<g clip-path="url(#p09feef2583)">
|
| 4087 |
+
<use ns4:href="#md7efaf3aec" x="83.607806" y="340.639848" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4088 |
+
<use ns4:href="#md7efaf3aec" x="226.799032" y="324.181385" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4089 |
+
<use ns4:href="#md7efaf3aec" x="369.990258" y="320.559009" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4090 |
+
<use ns4:href="#md7efaf3aec" x="513.181484" y="308.901185" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4091 |
+
<use ns4:href="#md7efaf3aec" x="656.37271" y="265.282228" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4092 |
+
<use ns4:href="#md7efaf3aec" x="799.563935" y="254.967155" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4093 |
</g>
|
| 4094 |
</g>
|
| 4095 |
<g id="series--torch-mem-eff" class="series">
|
| 4096 |
+
<path d="M 83.607806 156.748591 L 226.799032 137.315018 L 369.990258 105.143013 L 513.181484 114.228248 L 656.37271 86.655469 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4097 |
<defs>
|
| 4098 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4099 |
</defs>
|
| 4100 |
<g clip-path="url(#p09feef2583)">
|
| 4101 |
+
<use ns4:href="#m9b8c54d372" x="83.607806" y="156.748591" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4102 |
+
<use ns4:href="#m9b8c54d372" x="226.799032" y="137.315018" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4103 |
+
<use ns4:href="#m9b8c54d372" x="369.990258" y="105.143013" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4104 |
+
<use ns4:href="#m9b8c54d372" x="513.181484" y="114.228248" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4105 |
+
<use ns4:href="#m9b8c54d372" x="656.37271" y="86.655469" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4106 |
<use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4107 |
</g>
|
| 4108 |
</g>
|
| 4109 |
<g id="series--xformers-meff" class="series">
|
| 4110 |
+
<path d="M 83.607806 410.498293 L 226.799032 399.197519 L 369.990258 383.346345 L 513.181484 381.042612 L 656.37271 332.003214 L 799.563935 335.418073 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
|
| 4111 |
<defs>
|
| 4112 |
<path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
|
| 4113 |
</defs>
|
| 4114 |
<g clip-path="url(#p09feef2583)">
|
| 4115 |
+
<use ns4:href="#mc655281e0b" x="83.607806" y="410.498293" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4116 |
+
<use ns4:href="#mc655281e0b" x="226.799032" y="399.197519" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4117 |
+
<use ns4:href="#mc655281e0b" x="369.990258" y="383.346345" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4118 |
+
<use ns4:href="#mc655281e0b" x="513.181484" y="381.042612" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4119 |
+
<use ns4:href="#mc655281e0b" x="656.37271" y="332.003214" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4120 |
+
<use ns4:href="#mc655281e0b" x="799.563935" y="335.418073" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4121 |
</g>
|
| 4122 |
</g>
|
| 4123 |
<g id="series--hf-kernels-flash-attn" class="series">
|
| 4124 |
+
<path d="M 83.607806 418.603626 L 226.799032 405.380276 L 369.990258 389.547718 L 513.181484 382.629499 L 656.37271 335.525188 L 799.563935 340.270592 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
|
| 4125 |
<defs>
|
| 4126 |
<path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
|
| 4127 |
</defs>
|
| 4128 |
<g clip-path="url(#p09feef2583)">
|
| 4129 |
+
<use ns4:href="#m61c8040d7e" x="83.607806" y="418.603626" style="fill: #d62728; stroke: #d62728" />
|
| 4130 |
+
<use ns4:href="#m61c8040d7e" x="226.799032" y="405.380276" style="fill: #d62728; stroke: #d62728" />
|
| 4131 |
+
<use ns4:href="#m61c8040d7e" x="369.990258" y="389.547718" style="fill: #d62728; stroke: #d62728" />
|
| 4132 |
+
<use ns4:href="#m61c8040d7e" x="513.181484" y="382.629499" style="fill: #d62728; stroke: #d62728" />
|
| 4133 |
+
<use ns4:href="#m61c8040d7e" x="656.37271" y="335.525188" style="fill: #d62728; stroke: #d62728" />
|
| 4134 |
+
<use ns4:href="#m61c8040d7e" x="799.563935" y="340.270592" style="fill: #d62728; stroke: #d62728" />
|
| 4135 |
</g>
|
| 4136 |
</g>
|
| 4137 |
<g id="series--hf-kernels-flash-attn3" class="series">
|
| 4138 |
+
<path d="M 83.607806 428.387702 L 226.799032 418.05737 L 369.990258 396.545281 L 513.181484 392.764216 L 656.37271 347.753681 L 799.563935 353.503096 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
|
| 4139 |
<defs>
|
| 4140 |
<path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
|
| 4141 |
</defs>
|
| 4142 |
<g clip-path="url(#p09feef2583)">
|
| 4143 |
<use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
|
| 4144 |
+
<use ns4:href="#m7cd35be9cc" x="226.799032" y="418.05737" style="fill: #9467bd; stroke: #9467bd" />
|
| 4145 |
+
<use ns4:href="#m7cd35be9cc" x="369.990258" y="396.545281" style="fill: #9467bd; stroke: #9467bd" />
|
| 4146 |
+
<use ns4:href="#m7cd35be9cc" x="513.181484" y="392.764216" style="fill: #9467bd; stroke: #9467bd" />
|
| 4147 |
+
<use ns4:href="#m7cd35be9cc" x="656.37271" y="347.753681" style="fill: #9467bd; stroke: #9467bd" />
|
| 4148 |
+
<use ns4:href="#m7cd35be9cc" x="799.563935" y="353.503096" style="fill: #9467bd; stroke: #9467bd" />
|
| 4149 |
</g>
|
| 4150 |
</g>
|
| 4151 |
<g id="patch_3">
|
|
|
|
| 4337 |
COMBINED BENCHMARK SUMMARY
|
| 4338 |
|
| 4339 |
impl wl p50(ms) ok
|
| 4340 |
+
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.97 True
|
| 4341 |
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
|
| 4342 |
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
|
| 4343 |
+
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.09 True
|
| 4344 |
+
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True
|
| 4345 |
+
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
|
| 4346 |
+
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.94 True
|
| 4347 |
+
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
|
| 4348 |
+
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.04 True
|
| 4349 |
+
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.05 True
|
| 4350 |
+
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
|
| 4351 |
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
|
| 4352 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 4353 |
+
Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
|
| 4354 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 4355 |
+
Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
|
| 4356 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 4357 |
+
Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
|
| 4358 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 4359 |
+
Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
|
| 4360 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 4361 |
+
Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
|
| 4362 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 4363 |
+
Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
|
| 4364 |
torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
|
| 4365 |
+
torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
|
| 4366 |
+
torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
|
| 4367 |
+
torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
|
| 4368 |
torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
|
| 4369 |
torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
|
| 4370 |
+
torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
|
| 4371 |
+
torch_mem_eff cuda_attn_L256_bfloat16 1.89 True
|
| 4372 |
+
torch_mem_eff cuda_attn_L320_bfloat16 2.00 True
|
| 4373 |
+
torch_mem_eff cuda_attn_L384_bfloat16 1.97 True
|
| 4374 |
+
torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
|
| 4375 |
torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
|
| 4376 |
+
xformers_meff cuda_attn_L128_bfloat16 1.00 True
|
| 4377 |
xformers_meff cuda_attn_L256_bfloat16 1.03 True
|
| 4378 |
xformers_meff cuda_attn_L320_bfloat16 1.08 True
|
| 4379 |
+
xformers_meff cuda_attn_L384_bfloat16 1.09 True
|
| 4380 |
+
xformers_meff cuda_attn_L448_bfloat16 1.25 True
|
| 4381 |
+
xformers_meff cuda_attn_L512_bfloat16 1.24 True
|
| 4382 |
|
| 4383 |
GENERATING COMBINED VISUALIZATION
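The p50(ms) column above is presumably the median latency over the measured repetitions for each workload; a minimal sketch of that reduction (raw_times_ms is a hypothetical list of per-rep latencies in milliseconds):

import statistics

def p50_ms(raw_times_ms):
    # Median over the repetitions; robust to a single slow outlier rep.
    return round(statistics.median(raw_times_ms), 2)

print(p50_ms([1.22, 1.26, 1.24, 1.23, 1.25]))  # -> 1.24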
|
| 4384 |
|
|
|
|
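The sage_int8_fp16 rows above all fail the same way: the loaded extension module lacks the expected fwd entry point, so every workload fails before running. A defensive lookup of the following shape (sage_module and run_workload are hypothetical names, not the harness's actual API) records the same error while letting the remaining workloads proceed:

# Probe for the kernel entry point before benchmarking.
fwd = getattr(sage_module, "fwd", None)
if fwd is None:
    record = {"ok": False, "err": f"module {sage_module.__name__!r} has no attribute 'fwd'"}
else:
    record = run_workload(fwd)  # hypothetical benchmark driver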
| 4402 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4403 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4404 |
<div class="uv-logs-content" style="display: none;">
|
| 4405 |
+
Installed 37 packages in 208ms
|
[SVG chart diff: the embedded latency figure was regenerated for this upload. Its metadata date is now 2025-10-29T14:28:03; the y-axis ticks (1.0 through 2.2) and their grid lines were redrawn at new coordinates; and the torch-flash-ma, torch-mem-eff, xformers-meff, hf-kernels-flash-attn, and hf-kernels-flash-attn3 series were re-plotted with updated data points.]
|
index.html
CHANGED
|
The diff for this file is too large to render; see the raw diff.
|
|
|
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
CHANGED
|
@@ -1,4 +1,4 @@
|
| 1-4 |
-
{"ts": "2025-10-…  (the previous run's four records; the diff viewer truncates the removed lines, and the replacement records follow in full)
|
|
|
|
| 1 |
+
{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8322699999894212, "p50": 0.8364899999833142, "p90": 0.8382409999967422, "mean": 0.8359703999872181, "iqr": 0.0036810000096920703, "raw_times": [0.8322699999894212, 0.8382909999795629, 0.8345599999870501, 0.8382409999967422, 0.8364899999833142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8381600000006983, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6435499999829517, "p50": 1.6499199999771008, "p90": 1.6516499999852385, "mean": 1.650240399987979, "iqr": 0.0024989999474200886, "raw_times": [1.6516499999852385, 1.6435499999829517, 1.6499199999771008, 1.6491510000378184, 1.656930999956785], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.644769999984419, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6425610000396773, "p50": 1.6517310000381258, "p90": 1.654420999955164, "mean": 1.6505027999983213, "iqr": 0.006990999963818467, "raw_times": [1.6474299999913455, 1.6517310000381258, 1.654420999955164, 1.6563709999672938, 1.6425610000396773], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6471609999939574, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.237169999977141, "p50": 3.2576509999557857, "p90": 3.264301000001524, "mean": 3.257706599993071, "iqr": 0.008230999981151399, "raw_times": [3.2576509999557857, 3.264301000001524, 3.2733410000105323, 3.2560700000203724, 3.237169999977141], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2725309999932506, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
|
layer_norm/impls/hf_kernels_layer_norm.html
CHANGED
|
@@ old side (pre-upload version; the diff viewer truncates most removed lines) @@
- a stray "<" before the <h1>HF Kernels LayerNorm Implementation</h1> heading (removed in this upload)
- the "Cell: benchmark |" header without a wall-clock time and without the GitHub link added below
- PROFILE TRACE tables for LN_B16_S2048_D4096, LN_B16_S2048_D8192, LN_B16_S4096_D4096, and LN_B16_S4096_D8192, with the previous run's Self CPU / Self CUDA timings cut off mid-number; the updated tables appear in full below
- the p50 summary row for LN_B16_S4096_D8192, truncated at "3."
- "Fetching 4 files" stderr progress lines from the previous download
|
|
| 3862 |
</div>
|
| 3863 |
|
| 3864 |
<div class="main-content">
|
| 3865 |
+
<h1>HF Kernels LayerNorm Implementation</h1>
|
|
|
|
| 3866 |
<p>Based on kernels-community <code>layer-norm</code> kernel.</p>
|
| 3867 |
<h2>LayerNorm Benchmark (HF Kernels)</h2>
|
| 3868 |
<div class="cell" id="cell-benchmark">
|
|
|
|
| 3872 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3873 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3874 |
</span> |
|
| 3875 |
+
Cell: benchmark | 6.34s
|
| 3876 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3877 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3878 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3879 |
+
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/layer_norm/impls/hf_kernels_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
|
| 3880 |
</div>
|
| 3881 |
<div id="code-benchmark" class="cell-code" data-lines="49">
|
| 3882 |
<div class="code-wrap">
|
|
|
|
| 3943 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3944 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3945 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3946 |
+
hf_kernels_layer_norm 5.26% 209.855us 46.73% 1.864ms 1.864ms 0.000us 0.00% 3.097ms 3.097ms 1
|
| 3947 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 1.78% 70.832us 40.86% 1.630ms 543.337us 2.360ms 100.00% 3.097ms 1.032ms 3
|
| 3948 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.362ms 100.06% 2.362ms 2.362ms 1
|
| 3949 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.360ms 100.00% 2.360ms 786.699us 3
|
| 3950 |
+
Activity Buffer Request 36.61% 1.461ms 36.61% 1.461ms 1.461ms 736.736us 31.22% 736.736us 736.736us 1
|
| 3951 |
+
aten::view 0.61% 24.271us 0.61% 24.271us 4.045us 0.000us 0.00% 0.000us 0.000us 6
|
| 3952 |
+
aten::empty 1.19% 47.642us 1.19% 47.642us 5.294us 0.000us 0.00% 0.000us 0.000us 9
|
| 3953 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.27% 10.789us 0.27% 10.789us 3.596us 0.000us 0.00% 0.000us 0.000us 3
|
| 3954 |
+
cudaLaunchKernel 1.01% 40.102us 1.01% 40.102us 13.367us 0.000us 0.00% 0.000us 0.000us 3
|
| 3955 |
+
cudaDeviceSynchronize 53.27% 2.125ms 53.27% 2.125ms 2.125ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3956 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3957 |
+
Self CPU time total: 3.989ms
|
| 3958 |
+
Self CUDA time total: 2.360ms
|
| 3959 |
|
| 3960 |
|
| 3961 |
|
|
|
|
| 3965 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3966 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3967 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3968 |
+
hf_kernels_layer_norm 2.24% 143.733us 27.27% 1.751ms 1.751ms 0.000us 0.00% 6.440ms 6.440ms 1
|
| 3969 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 0.75% 48.181us 24.84% 1.595ms 531.669us 4.846ms 100.00% 6.440ms 2.147ms 3
|
| 3970 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.848ms 100.03% 4.848ms 4.848ms 1
|
| 3971 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.846ms 100.00% 4.846ms 1.615ms 3
|
| 3972 |
+
Activity Buffer Request 23.08% 1.482ms 23.08% 1.482ms 1.482ms 1.594ms 32.88% 1.594ms 1.594ms 1
|
| 3973 |
+
aten::view 0.20% 12.572us 0.20% 12.572us 2.095us 0.000us 0.00% 0.000us 0.000us 6
|
| 3974 |
+
aten::empty 0.46% 29.840us 0.46% 29.840us 3.316us 0.000us 0.00% 0.000us 0.000us 9
|
| 3975 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.420us 0.08% 5.420us 1.807us 0.000us 0.00% 0.000us 0.000us 3
|
| 3976 |
+
cudaLaunchKernel 0.46% 29.490us 0.46% 29.490us 9.830us 0.000us 0.00% 0.000us 0.000us 3
|
| 3977 |
+
cudaDeviceSynchronize 72.73% 4.670ms 72.73% 4.670ms 4.670ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3978 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3979 |
+
Self CPU time total: 6.421ms
|
| 3980 |
+
Self CUDA time total: 4.846ms
|
| 3981 |
|
| 3982 |
|
| 3983 |
|
|
|
|
| 3987 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3988 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3989 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3990 |
+
hf_kernels_layer_norm 1.96% 126.465us 27.43% 1.766ms 1.766ms 0.000us 0.00% 6.435ms 6.435ms 1
|
| 3991 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 0.73% 46.779us 25.26% 1.627ms 542.360us 4.838ms 100.00% 6.435ms 2.145ms 3
|
| 3992 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.839ms 100.03% 4.839ms 4.839ms 1
|
| 3993 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.838ms 100.00% 4.838ms 1.613ms 3
|
| 3994 |
+
Activity Buffer Request 23.54% 1.516ms 23.54% 1.516ms 1.516ms 1.597ms 33.01% 1.597ms 1.597ms 1
|
| 3995 |
+
aten::view 0.20% 12.929us 0.20% 12.929us 2.155us 0.000us 0.00% 0.000us 0.000us 6
|
| 3996 |
+
aten::empty 0.46% 29.911us 0.46% 29.911us 3.323us 0.000us 0.00% 0.000us 0.000us 9
|
| 3997 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.300us 0.08% 5.300us 1.767us 0.000us 0.00% 0.000us 0.000us 3
|
| 3998 |
+
cudaLaunchKernel 0.45% 29.003us 0.45% 29.003us 9.668us 0.000us 0.00% 0.000us 0.000us 3
|
| 3999 |
+
cudaDeviceSynchronize 72.57% 4.674ms 72.57% 4.674ms 4.674ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4000 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4001 |
+
Self CPU time total: 6.440ms
|
| 4002 |
+
Self CUDA time total: 4.838ms
|
| 4003 |
|
| 4004 |
|
| 4005 |
|
|
|
|
| 4009 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4010 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4011 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4012 |
+
hf_kernels_layer_norm 1.17% 134.085us 17.09% 1.957ms 1.957ms 0.000us 0.00% 12.886ms 12.886ms 1
|
| 4013 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 0.41% 46.869us 15.80% 1.809ms 603.015us 9.665ms 100.00% 12.886ms 4.295ms 3
|
| 4014 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.667ms 100.01% 9.667ms 9.667ms 1
|
| 4015 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.665ms 100.00% 9.665ms 3.222ms 3
|
| 4016 |
+
Activity Buffer Request 12.76% 1.462ms 12.76% 1.462ms 1.462ms 3.220ms 33.32% 3.220ms 3.220ms 1
|
| 4017 |
+
aten::view 0.12% 13.968us 0.12% 13.968us 2.328us 0.000us 0.00% 0.000us 0.000us 6
|
| 4018 |
+
aten::empty 0.26% 30.043us 0.26% 30.043us 3.338us 0.000us 0.00% 0.000us 0.000us 9
|
| 4019 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.590us 0.05% 5.590us 1.863us 0.000us 0.00% 0.000us 0.000us 3
|
| 4020 |
+
cudaLaunchKernel 2.31% 264.797us 2.31% 264.797us 88.266us 0.000us 0.00% 0.000us 0.000us 3
|
| 4021 |
+
cudaDeviceSynchronize 82.91% 9.495ms 82.91% 9.495ms 9.495ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4022 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4023 |
+
Self CPU time total: 11.452ms
|
| 4024 |
+
Self CUDA time total: 9.665ms
|
| 4025 |
|
| 4026 |
|
| 4027 |
impl wl p50(ms) ok
|
| 4028 |
hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
|
| 4029 |
hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
|
| 4030 |
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
|
| 4031 |
+
hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
|
| 4032 |
</pre></div>
|
| 4033 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4034 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4035 |
<div class="uv-logs-content" style="display: none;">
|
| 4036 |
+
Downloading hf-xet (3.2MiB)
|
| 4037 |
+
Downloading hf-xet
|
| 4038 |
Installed 15 packages in 13ms
|
| 4039 |
</div>
|
| 4040 |
</div>
|
| 4041 |
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4042 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.22it/s]
|
| 4043 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.44it/s]</div>
|
|
|
|
| 4044 |
<div class="cell-artifacts">
|
| 4045 |
<h4>Artifacts:</h4>
|
| 4046 |
<a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
|
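The ok/absmax columns above come from the correctness block in the JSONL (rtol=0.001, atol=0.03125 against a layer_norm_ref computed in fp32). A minimal sketch of that style of check, assuming the implementation under test takes (x, weight, bias); this is illustrative, not the harness's actual code:

import torch
import torch.nn.functional as F

def check_layer_norm(impl, batch=16, seq_len=2048, hidden_dim=4096):
    # bfloat16 inputs, as in the LN_B16_S2048_D4096 workload above.
    x = torch.randn(batch, seq_len, hidden_dim, device="cuda", dtype=torch.bfloat16)
    w = torch.randn(hidden_dim, device="cuda", dtype=torch.bfloat16)
    b = torch.randn(hidden_dim, device="cuda", dtype=torch.bfloat16)
    out = impl(x, w, b).float()
    # fp32 reference, mirroring the "ref": "layer_norm_ref" entry.
    ref = F.layer_norm(x.float(), (hidden_dim,), w.float(), b.float())
    absmax = (out - ref).abs().max().item()
    ok = torch.allclose(out, ref, rtol=1e-3, atol=0.03125)
    return ok, absmax

# e.g. check_layer_norm(lambda x, w, b: F.layer_norm(x, (4096,), w, b))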
layer_norm/impls/torch_layer_norm.html
CHANGED
|
@@ old side (pre-upload version; the diff viewer truncates most removed lines) @@
- a stray "<" before the <h1>Torch LayerNorm Implementation</h1> heading (removed in this upload)
- "Cell: nv | 0.22s" and "Cell: benchmark | 7.…s" headers with the previous wall-clock times; both cells gain a GitHub link below
- the nvidia-smi GPU status row ("| N/A …") truncated; the updated row appears below
- PROFILE TRACE tables for LN_B16_S2048_D4096, LN_B16_S2048_D8192, LN_B16_S4096_D4096, and LN_B16_S4096_D8192, with the previous run's Self CPU / Self CUDA timings cut off mid-number; full updated tables follow
- unchanged context row: torch_layer_norm LN_B16_S4096_D8192 3.33 True
- the "Installed 37 packages in …ms" uv-log line, truncated
|
|
|
| 3862 |
</div>
|
| 3863 |
|
| 3864 |
<div class="main-content">
|
| 3865 |
+
<h1>Torch LayerNorm Implementation</h1>
|
|
|
|
| 3866 |
<h2>GPU Info</h2>
|
| 3867 |
<div class="cell" id="cell-nv">
|
| 3868 |
<div class="cell-header">
|
|
|
|
| 3871 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: nv | 0.26s
|
| 3875 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3877 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
| 3878 |
+
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/layer_norm/impls/torch_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
|
| 3879 |
</div>
|
| 3880 |
<div id="code-nv" class="cell-code" data-lines="2">
|
| 3881 |
<div class="code-wrap">
|
|
|
|
| 3887 |
</div>
|
| 3888 |
</div>
|
| 3889 |
<div id="output-nv" class="cell-output">
|
| 3890 |
+
<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:26:26 2025
|
| 3891 |
+-----------------------------------------------------------------------------------------+
|
| 3892 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3893 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3896 |
| | | MIG M. |
|
| 3897 |
|=========================================+========================+======================|
|
| 3898 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3899 |
+
| N/A 30C P0 108W / 350W | 0MiB / 46068MiB | 100% Default |
|
| 3900 |
| | | N/A |
|
| 3901 |
+-----------------------------------------+------------------------+----------------------+
|
| 3902 |
|
|
|
|
| 3920 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3921 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3922 |
</span> |
|
| 3923 |
+
Cell: benchmark | 7.36s
|
| 3924 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3925 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3926 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3927 |
+
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/layer_norm/impls/torch_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
|
| 3928 |
</div>
|
| 3929 |
<div id="code-benchmark" class="cell-code" data-lines="26">
|
| 3930 |
<div class="code-wrap">
|
|
|
|
| 3968 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3969 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3970 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3971 |
+
torch_layer_norm 3.90% 151.572us 46.01% 1.786ms 1.786ms 0.000us 0.00% 3.026ms 3.026ms 1
|
| 3972 |
+
aten::layer_norm 0.43% 16.762us 42.11% 1.635ms 544.851us 0.000us 0.00% 3.026ms 1.009ms 3
|
| 3973 |
+
aten::native_layer_norm 2.06% 80.009us 41.67% 1.618ms 539.263us 2.316ms 100.00% 3.026ms 1.009ms 3
|
| 3974 |
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.318ms 100.06% 2.318ms 2.318ms 1
|
| 3975 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.316ms 100.00% 2.316ms 772.127us 3
|
| 3976 |
+
Activity Buffer Request 37.08% 1.440ms 37.08% 1.440ms 1.440ms 709.855us 30.65% 709.855us 709.855us 1
|
| 3977 |
+
aten::empty 1.19% 46.261us 1.19% 46.261us 5.140us 0.000us 0.00% 0.000us 0.000us 9
|
| 3978 |
+
cudaLaunchKernel 1.16% 45.163us 1.16% 45.163us 15.054us 0.000us 0.00% 0.000us 0.000us 3
|
| 3979 |
+
aten::view 0.17% 6.761us 0.17% 6.761us 1.127us 0.000us 0.00% 0.000us 0.000us 6
|
| 3980 |
+
cudaDeviceSynchronize 53.99% 2.096ms 53.99% 2.096ms 2.096ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3981 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3982 |
+
Self CPU time total: 3.882ms
|
| 3983 |
+
Self CUDA time total: 2.316ms
|
| 3984 |
|
| 3985 |
|
| 3986 |
|
|
|
|
| 3990 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3991 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3992 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3993 |
+
torch_layer_norm 1.19% 75.581us 25.55% 1.628ms 1.628ms 0.000us 0.00% 6.473ms 6.473ms 1
|
| 3994 |
+
aten::layer_norm 0.14% 9.142us 24.37% 1.553ms 517.550us 0.000us 0.00% 6.473ms 2.158ms 3
|
| 3995 |
+
aten::native_layer_norm 0.81% 51.921us 24.22% 1.544ms 514.502us 4.881ms 100.00% 6.473ms 2.158ms 3
|
| 3996 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.882ms 100.03% 4.882ms 4.882ms 1
|
| 3997 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.881ms 100.00% 4.881ms 1.627ms 3
|
| 3998 |
+
Activity Buffer Request 22.46% 1.431ms 22.46% 1.431ms 1.431ms 1.592ms 32.61% 1.592ms 1.592ms 1
|
| 3999 |
+
aten::empty 0.44% 27.841us 0.44% 27.841us 3.093us 0.000us 0.00% 0.000us 0.000us 9
|
| 4000 |
+
cudaLaunchKernel 0.45% 28.910us 0.45% 28.910us 9.637us 0.000us 0.00% 0.000us 0.000us 3
|
| 4001 |
+
aten::view 0.06% 3.829us 0.06% 3.829us 0.638us 0.000us 0.00% 0.000us 0.000us 6
|
| 4002 |
+
cudaDeviceSynchronize 74.45% 4.743ms 74.45% 4.743ms 4.743ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
+
Self CPU time total: 6.372ms
|
| 4005 |
+
Self CUDA time total: 4.881ms
|
| 4006 |
|
| 4007 |
|
| 4008 |
|
|
|
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4014 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4015 |
+
torch_layer_norm 1.15% 71.882us 26.71% 1.668ms 1.668ms 0.000us 0.00% 6.222ms 6.222ms 1
|
| 4016 |
+
aten::layer_norm 0.15% 9.629us 25.56% 1.596ms 532.153us 0.000us 0.00% 6.222ms 2.074ms 3
|
| 4017 |
+
aten::native_layer_norm 0.90% 56.373us 25.41% 1.587ms 528.943us 4.717ms 100.00% 6.222ms 2.074ms 3
|
| 4018 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.718ms 100.03% 4.718ms 4.718ms 1
|
| 4019 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.717ms 100.00% 4.717ms 1.572ms 3
|
| 4020 |
+
Activity Buffer Request 23.44% 1.464ms 23.44% 1.464ms 1.464ms 1.506ms 31.93% 1.506ms 1.506ms 1
|
| 4021 |
+
aten::empty 0.46% 28.850us 0.46% 28.850us 3.206us 0.000us 0.00% 0.000us 0.000us 9
|
| 4022 |
+
cudaLaunchKernel 0.52% 32.781us 0.52% 32.781us 10.927us 0.000us 0.00% 0.000us 0.000us 3
|
| 4023 |
+
aten::view 0.07% 4.590us 0.07% 4.590us 0.765us 0.000us 0.00% 0.000us 0.000us 6
|
| 4024 |
+
cudaDeviceSynchronize 73.29% 4.577ms 73.29% 4.577ms 4.577ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4025 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4026 |
+
Self CPU time total: 6.246ms
|
| 4027 |
+
Self CUDA time total: 4.717ms
|
| 4028 |
|
| 4029 |
|
| 4030 |
|
|
|
|
| 4034 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4035 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
+
torch_layer_norm 0.67% 74.340us 13.35% 1.490ms 1.490ms 0.000us 0.00% 13.028ms 13.028ms 1
|
| 4038 |
+
aten::layer_norm 0.09% 9.510us 12.69% 1.416ms 471.835us 0.000us 0.00% 13.028ms 4.343ms 3
|
| 4039 |
+
aten::native_layer_norm 0.47% 52.269us 12.60% 1.406ms 468.665us 9.808ms 100.00% 13.028ms 4.343ms 3
|
| 4040 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.809ms 100.02% 9.809ms 9.809ms 1
|
| 4041 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.808ms 100.00% 9.808ms 3.269ms 3
|
| 4042 |
+
Activity Buffer Request 9.72% 1.085ms 9.72% 1.085ms 1.085ms 3.220ms 32.83% 3.220ms 3.220ms 1
|
| 4043 |
+
aten::empty 0.26% 29.181us 0.26% 29.181us 3.242us 0.000us 0.00% 0.000us 0.000us 9
|
| 4044 |
+
cudaLaunchKernel 2.11% 235.817us 2.11% 235.817us 78.606us 0.000us 0.00% 0.000us 0.000us 3
|
| 4045 |
+
aten::view 0.04% 4.022us 0.04% 4.022us 0.670us 0.000us 0.00% 0.000us 0.000us 6
|
| 4046 |
+
cudaDeviceSynchronize 86.65% 9.669ms 86.65% 9.669ms 9.669ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4047 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4048 |
+
Self CPU time total: 11.159ms
|
| 4049 |
+
Self CUDA time total: 9.808ms
|
| 4050 |
|
| 4051 |
|
| 4052 |
impl wl p50(ms) ok
|
|
|
|
| 4058 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4059 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4060 |
<div class="uv-logs-content" style="display: none;">
|
| 4061 |
+
Installed 37 packages in 222ms
|
| 4062 |
</div>
|
| 4063 |
</div>
|
| 4064 |
<div class="cell-artifacts">
|
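The p50 figures in the tables above follow the protocol recorded in the JSONL: 2 warmup runs, 5 timed reps, median of the raw times. A minimal sketch of that protocol using CUDA events; names are illustrative, not the benchmark cell's code:

import statistics
import torch
import torch.nn.functional as F

def p50_latency_ms(fn, warmup=2, reps=5):
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()
    times = []
    for _ in range(reps):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        fn()
        end.record()
        torch.cuda.synchronize()
        times.append(start.elapsed_time(end))  # milliseconds
    return statistics.median(times)

x = torch.randn(16, 2048, 4096, device="cuda", dtype=torch.bfloat16)
w = torch.ones(4096, device="cuda", dtype=torch.bfloat16)
b = torch.zeros(4096, device="cuda", dtype=torch.bfloat16)
print(p50_latency_ms(lambda: F.layer_norm(x, (4096,), w, b)))  # roughly 0.8 ms on the L40S per the table above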
layer_norm/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details (old and new LFS pointers; binary SVG, diff not rendered)
|
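A hedged sketch of how a latency chart like this one can be rebuilt from the per-implementation JSONL records (matplotlib is implied by the SVG metadata in the next file; the input file names here are illustrative):

import json
from collections import defaultdict
import matplotlib.pyplot as plt

series = defaultdict(list)  # impl -> [(workload, p50 ms), ...]
for path in ["hf_kernels_layer_norm.jsonl", "torch_layer_norm.jsonl"]:  # hypothetical paths
    with open(path) as f:
        for line in f:
            rec = json.loads(line)
            series[rec["impl"]].append((rec["wl"]["name"], rec["lat_ms"]["p50"]))

fig, ax = plt.subplots(figsize=(9, 5))
for impl, points in series.items():
    names, p50s = zip(*points)
    ax.plot(names, p50s, marker="o", label=impl)
ax.set_ylabel("p50 latency (ms)")
ax.grid(True, axis="y", alpha=0.3)
ax.legend()
fig.savefig("latency.svg")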
layer_norm/results/combined_results.html
CHANGED
|
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3872 |
<rdf:RDF>
|
| 3873 |
<ns2:Work>
|
| 3874 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3875 |
-
<dc:date>2025-10-
|
| 3876 |
<dc:format>image/svg+xml</dc:format>
|
| 3877 |
<dc:creator>
|
| 3878 |
<ns2:Agent>
|
|
@@ old side of the embedded latency chart (removed coordinates truncated by the diff viewer) @@
- y-axis tick marks, grid paths, and tick labels (anchored at x="40.72"/"47.72") redrawn at new y-coordinates
- the torch-layer-norm and hf-kernels-layer-norm series paths and markers re-plotted; the endpoints at (83.741924, 437.689571) and (804.180406, 46.442361) are unchanged
|
@@ -4105,7 +4105,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4105 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4106 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4107 |
</span> |
|
| 4108 |
-
Cell: combine | 4.
|
| 4109 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4110 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4111 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4195,7 +4195,7 @@ impl wl p50(ms) ok
|
|
| 4195 |
hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
|
| 4196 |
hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
|
| 4197 |
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
|
| 4198 |
-
hf_kernels_layer_norm LN_B16_S4096_D8192 3.
|
| 4199 |
torch_layer_norm LN_B16_S2048_D4096 0.82 True
|
| 4200 |
torch_layer_norm LN_B16_S2048_D8192 1.68 True
|
| 4201 |
torch_layer_norm LN_B16_S4096_D4096 1.61 True
|
|
@@ -4219,7 +4219,7 @@ Implementations included:
|
|
| 4219 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4220 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4221 |
<div class="uv-logs-content" style="display: none;">
|
| 4222 |
-
Installed 37 packages in
|
| 4223 |
</div>
|
| 4224 |
</div>
|
| 4225 |
<div class="cell-artifacts">
|
|
@@ -4232,7 +4232,7 @@ Installed 37 packages in 219ms
|
|
| 4232 |
<rdf:RDF>
|
| 4233 |
<ns2:Work>
|
| 4234 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4235 |
-
<dc:date>2025-10-
|
| 4236 |
<dc:format>image/svg+xml</dc:format>
|
| 4237 |
<dc:creator>
|
| 4238 |
<ns2:Agent>
|
|
@@ -4316,70 +4316,70 @@ Installed 37 packages in 219ms
|
|
| 4316 |
<g id="matplotlib.axis_2">
|
| 4317 |
<g id="ytick_1">
|
| 4318 |
<g id="grid-y--2" class="grid grid-y">
|
| 4319 |
-
<path d="M 47.72
|
| 4320 |
</g>
|
| 4321 |
<g id="line2d_5">
|
| 4322 |
<defs>
|
| 4323 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4324 |
</defs>
|
| 4325 |
<g>
|
| 4326 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4327 |
</g>
|
| 4328 |
</g>
|
| 4329 |
<g id="text_5">
|
| 4330 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.
|
| 4331 |
</g>
|
| 4332 |
</g>
|
| 4333 |
<g id="ytick_2">
|
| 4334 |
<g id="grid-y--3" class="grid grid-y">
|
| 4335 |
-
<path d="M 47.72 331.
|
| 4336 |
</g>
|
| 4337 |
<g id="line2d_6">
|
| 4338 |
<g>
|
| 4339 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="331.
|
| 4340 |
</g>
|
| 4341 |
</g>
|
| 4342 |
<g id="text_6">
|
| 4343 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4344 |
</g>
|
| 4345 |
</g>
|
| 4346 |
<g id="ytick_3">
|
| 4347 |
<g id="grid-y--4" class="grid grid-y">
|
| 4348 |
-
<path d="M 47.72 253.
|
| 4349 |
</g>
|
| 4350 |
<g id="line2d_7">
|
| 4351 |
<g>
|
| 4352 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="253.
|
| 4353 |
</g>
|
| 4354 |
</g>
|
| 4355 |
<g id="text_7">
|
| 4356 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4357 |
</g>
|
| 4358 |
</g>
|
| 4359 |
<g id="ytick_4">
|
| 4360 |
<g id="grid-y--5" class="grid grid-y">
|
| 4361 |
-
<path d="M 47.72 175.
|
| 4362 |
</g>
|
| 4363 |
<g id="line2d_8">
|
| 4364 |
<g>
|
| 4365 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="175.
|
| 4366 |
</g>
|
| 4367 |
</g>
|
| 4368 |
<g id="text_8">
|
| 4369 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.
|
| 4370 |
</g>
|
| 4371 |
</g>
|
| 4372 |
<g id="ytick_5">
|
| 4373 |
<g id="grid-y--6" class="grid grid-y">
|
| 4374 |
-
<path d="M 47.72
|
| 4375 |
</g>
|
| 4376 |
<g id="line2d_9">
|
| 4377 |
<g>
|
| 4378 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4379 |
</g>
|
| 4380 |
</g>
|
| 4381 |
<g id="text_9">
|
| 4382 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.
|
| 4383 |
</g>
|
| 4384 |
</g>
|
| 4385 |
<g id="label--y" class="ylabel">
|
|
@@ -4387,27 +4387,27 @@ Installed 37 packages in 219ms
|
|
| 4387 |
</g>
|
| 4388 |
</g>
|
| 4389 |
<g id="series--torch-layer-norm" class="series">
|
| 4390 |
-
<path d="M 83.741924 437.689571 L 323.888085
|
| 4391 |
<defs>
|
| 4392 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4393 |
</defs>
|
| 4394 |
<g clip-path="url(#p2214f54723)">
|
| 4395 |
<use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4396 |
-
<use ns4:href="#md7efaf3aec" x="323.888085" y="
|
| 4397 |
-
<use ns4:href="#md7efaf3aec" x="564.034245" y="
|
| 4398 |
<use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4399 |
</g>
|
| 4400 |
</g>
|
| 4401 |
<g id="series--hf-kernels-layer-norm" class="series">
|
| 4402 |
-
<path d="M 83.741924 434.
|
| 4403 |
<defs>
|
| 4404 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4405 |
</defs>
|
| 4406 |
<g clip-path="url(#p2214f54723)">
|
| 4407 |
-
<use ns4:href="#m9b8c54d372" x="83.741924" y="434.
|
| 4408 |
-
<use ns4:href="#m9b8c54d372" x="323.888085" y="307.
|
| 4409 |
-
<use ns4:href="#m9b8c54d372" x="564.034245" y="307.
|
| 4410 |
-
<use ns4:href="#m9b8c54d372" x="804.180406" y="
|
| 4411 |
</g>
|
| 4412 |
</g>
|
| 4413 |
<g id="patch_3">
|
|
|
|
Added side of the same hunks, shown in full:

SVG metadata (line 3875):
<dc:date>2025-10-29T14:27:45.722521</dc:date>
Regenerated latency plot (lines 3956-4053): y-axis grid lines and tick marks redrawn at y = 408.957392, 331.05018, 253.142969, 175.235758, 97.328546, labeled 1.0 / 1.5 / 2.0 / 2.5 / 3.0; series--torch-layer-norm now passes through (83.741924, 437.689571), (323.888085, 302.833591), (564.034245, 313.993176), (804.180406, 46.442361), and series--hf-kernels-layer-norm through (83.741924, 434.434608), (323.888085, 307.690482), (564.034245, 307.408302), (804.180406, 57.182805).
Cell: combine | 4.21s  (source: cells/combine.py)
impl                   wl                  p50(ms)  ok
hf_kernels_layer_norm  LN_B16_S2048_D4096  0.84     True
hf_kernels_layer_norm  LN_B16_S2048_D8192  1.65     True
hf_kernels_layer_norm  LN_B16_S4096_D4096  1.65     True
hf_kernels_layer_norm  LN_B16_S4096_D8192  3.26     True   (updated row)
torch_layer_norm       LN_B16_S2048_D4096  0.82     True
torch_layer_norm       LN_B16_S2048_D8192  1.68     True
torch_layer_norm       LN_B16_S4096_D4096  1.61     True
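As a cross-check, this table agrees with the regenerated plot: the SVG y-axis above is linear (1.0 ms at y = 408.957392, 3.0 ms at y = 97.328546), so the plotted marker positions can be inverted back to milliseconds. A small sketch, using only coordinates read off the series data above:

# Sketch: invert the plot's linear y-axis (calibrated from the tick rows
# above) to recover the p50 latencies behind each plotted marker.
Y_1MS, Y_3MS = 408.957392, 97.328546        # y positions of the 1.0 / 3.0 ms ticks
MS_PER_PX = (3.0 - 1.0) / (Y_1MS - Y_3MS)   # y decreases as latency grows

def y_to_ms(y):
    return 1.0 + (Y_1MS - y) * MS_PER_PX

series = {
    "torch_layer_norm":      [437.689571, 302.833591, 313.993176, 46.442361],
    "hf_kernels_layer_norm": [434.434608, 307.690482, 307.408302, 57.182805],
}
for name, ys in series.items():
    print(name, [round(y_to_ms(y), 2) for y in ys])
# torch_layer_norm      [0.82, 1.68, 1.61, 3.33]
# hf_kernels_layer_norm [0.84, 1.65, 1.65, 3.26]   <- matches the table rows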
UV Install Logs
Installed 37 packages in 210ms  (was 219ms in the previous revision, per the hunk context lines)
Artifact SVG metadata (line 4235):
<dc:date>2025-10-29T14:27:45.722521</dc:date>
Second embedded copy of the latency SVG (lines 4316-4413): regenerated with the same tick positions, tick labels, and series coordinates as the copy above.
rotary/impls/artifacts/benchmark/rotary.jsonl
CHANGED
@@ -1,24 +1,24 @@
[All 24 old torch_eager records ("ts": "2025-10-…") were replaced; their payloads are truncated in this rendering. The regenerated records follow in full:]
{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17006399997399058, "p50": 0.17533400000502297, "p90": 0.1853339999797754, "mean": 0.1802961999942454, "iqr": 0.014799999974002276, "raw_times": [0.17533400000502297, 0.20021500000666492, 0.17053400000577312, 0.1853339999797754, 0.17006399997399058], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18331500001522727, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2246159999685915, "p50": 0.2266160000203854, "p90": 0.22888500001272405, "mean": 0.22735560000910482, "iqr": 0.002880000010918593, "raw_times": [0.22600500000180546, 0.22888500001272405, 0.2246159999685915, 0.2266160000203854, 0.2306560000420177], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2471160000254713, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21813499995460006, "p50": 0.22189599997091136, "p90": 0.2272149999953399, "mean": 0.22315939997952228, "iqr": 0.007960000004914036, "raw_times": [0.2272149999953399, 0.22189599997091136, 0.21925499999042586, 0.21813499995460006, 0.2292959999863342], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2391049999914685, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21478600001501036, "p50": 0.21544499998071842, "p90": 0.2178249999928994, "mean": 0.2161891999890031, "iqr": 0.0027799999884337012, "raw_times": [0.21544499998071842, 0.2178249999928994, 0.21478600001501036, 0.21784499995192164, 0.2150450000044657], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22814599998355334, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2160950000416051, "p50": 0.22390499998437008, "p90": 0.22473600000694205, "mean": 0.22559540000202105, "iqr": 0.00507100003233063, "raw_times": [0.22390499998437008, 0.24357600000257662, 0.22473600000694205, 0.2160950000416051, 0.21966499997461142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.229085999990275, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21366499998975996, "p50": 0.21597500000325454, "p90": 0.21670500001391702, "mean": 0.2158129999884295, "iqr": 0.0008600000569458643, "raw_times": [0.21366499998975996, 0.2168749999782449, 0.21597500000325454, 0.21670500001391702, 0.21584499995697115], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21572499997546402, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2142449999951168, "p50": 0.21574499999132968, "p90": 0.2169850000086626, "mean": 0.21585539999477987, "iqr": 0.0022990000161371427, "raw_times": [0.2142449999951168, 0.2176159999862648, 0.21468599999252547, 0.2169850000086626, 0.21574499999132968], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2192349999745602, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21675499999673775, "p50": 0.21711599998752718, "p90": 0.21833499999956985, "mean": 0.2174776000060774, "iqr": 0.0015789999565640755, "raw_times": [0.21675499999673775, 0.21711599998752718, 0.21833499999956985, 0.21675600004300577, 0.2184260000035465], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22064600000248902, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2153649999740992, "p50": 0.21702599997297511, "p90": 0.21829499996783852, "mean": 0.21729759998834197, "iqr": 0.0014989999499448459, "raw_times": [0.2153649999740992, 0.21679600001789368, 0.21900600000890336, 0.21702599997297511, 0.21829499996783852], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22154499998805477, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2143060000321384, "p50": 0.2161449999675824, "p90": 0.21640500000330576, "mean": 0.21578740000904872, "iqr": 0.0008589999538344273, "raw_times": [0.21653499999274572, 0.21640500000330576, 0.2143060000321384, 0.2161449999675824, 0.21554600004947133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23455599995259035, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21488499999122723, "p50": 0.21633500000461936, "p90": 0.21918499999173946, "mean": 0.21730919999072285, "iqr": 0.004300000000512227, "raw_times": [0.21488499999122723, 0.21918499999173946, 0.21488499999122723, 0.22125599997480094, 0.21633500000461936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2185359999771208, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2230359999657594, "p50": 0.22526600002947816, "p90": 0.22695600000588456, "mean": 0.22723160000168718, "iqr": 0.0026509999884183344, "raw_times": [0.22526600002947816, 0.2230359999657594, 0.23659499998984757, 0.22430500001746623, 0.22695600000588456], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22456599998577076, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21608499997682884, "p50": 0.2175149999743553, "p90": 0.22948600002337116, "mean": 0.2247094000040306, "iqr": 0.012610999988282856, "raw_times": [0.21608499997682884, 0.2168750000350883, 0.22948600002337116, 0.24358600001050945, 0.2175149999743553], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21851499997183055, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2162149999662688, "p50": 0.21694499997693129, "p90": 0.2171250000060354, "mean": 0.21706100000074002, "iqr": 0.0003099999617006688, "raw_times": [0.2162149999662688, 0.21694499997693129, 0.2171250000060354, 0.21820500001012988, 0.21681500004433474], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21809500003655558, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2136749999976928, "p50": 0.21658500003240988, "p90": 0.21662599999672238, "mean": 0.21621120000645533, "iqr": 0.00066100000140068, "raw_times": [0.2136749999976928, 0.2159649999953217, 0.21658500003240988, 0.21662599999672238, 0.21820500001012988], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2181750000431748, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21695499998486412, "p50": 0.21774499998628016, "p90": 0.2285450000272249, "mean": 0.22256720000086716, "iqr": 0.010920000022451859, "raw_times": [0.21774499998628016, 0.21762500000477303, 0.2319660000011936, 0.21695499998486412, 0.2285450000272249], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22269599998026024, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21440599999777987, "p50": 0.21785499995985447, "p90": 0.2335159999802272, "mean": 0.2228595999895333, "iqr": 0.01891099998374557, "raw_times": [0.21440599999777987, 0.23391600001332336, 0.21785499995985447, 0.2335159999802272, 0.21460499999648164], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21932499998911226, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2133250000042608, "p50": 0.21422499997925115, "p90": 0.21653499999274572, "mean": 0.21708740000576654, "iqr": 0.0029589999712698045, "raw_times": [0.21357600002147592, 0.2133250000042608, 0.21653499999274572, 0.21422499997925115, 0.2277760000310991], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22739600001386862, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21597500000325454, "p50": 0.2176859999849512, "p90": 0.21771499996248167, "mean": 0.21758339998996234, "iqr": 0.0013999999737279722, "raw_times": [0.2176859999849512, 0.21771499996248167, 0.22022600001037063, 0.2163149999887537, 0.21597500000325454], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21613599994907418, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21577600000455277, "p50": 0.2173749999769825, "p90": 0.21900600000890336, "mean": 0.21836960000882755, "iqr": 0.0018509999790694565, "raw_times": [0.2171550000298339, 0.21577600000455277, 0.2225360000238652, 0.2173749999769825, 0.21900600000890336], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22321599999486352, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21538499998996485, "p50": 0.21647599999141676, "p90": 0.21717500004569956, "mean": 0.2167214000110107, "iqr": 0.001030000021273736, "raw_times": [0.21717500004569956, 0.2184260000035465, 0.21538499998996485, 0.21614500002442583, 0.21647599999141676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21872600001415776, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21501500003751062, "p50": 0.2168760000245129, "p90": 0.2187050000088675, "mean": 0.21949320001795058, "iqr": 0.0030500000320898835, "raw_times": [0.21565499997677762, 0.2187050000088675, 0.21501500003751062, 0.23121500004208428, 0.2168760000245129], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22076499999457155, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22937599999295344, "p50": 0.23008499999832566, "p90": 0.23144499999716572, "mean": 0.23359140000138723, "iqr": 0.0020100000028833165, "raw_times": [0.23144499999716572, 0.23008499999832566, 0.2294349999942824, 0.22937599999295344, 0.24761600002420892], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23195599999326078, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.637245999996594, "p50": 0.6388759999822469, "p90": 0.6389449999915087, "mean": 0.6396317999929124, "iqr": 0.0012190000120426703, "raw_times": [0.6388759999822469, 0.6453660000147465, 0.6389449999915087, 0.637245999996594, 0.637725999979466], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.6364359999793123, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
rotary/impls/hf_kernels_rotary.html
CHANGED
The diff for this file is too large to render. See raw diff.

rotary/impls/torch_rotary.html
CHANGED
The diff for this file is too large to render. See raw diff.

rotary/index.html
CHANGED
The diff for this file is too large to render. See raw diff.
rotary/results/artifacts/combine/latency.svg
CHANGED
Git LFS Details
rotary/results/combined_results.html
CHANGED
@@ -3872,7 +3872,7 @@
[SVG metadata (line 3875): the old <dc:date>2025-10-…</dc:date> is replaced with the new generation timestamp; the old value is truncated in this rendering.]
@@ -4216,70 +4216,70 @@
[Latency plot y-axis (lines 4216-4285): grid paths, tick marks (line2d_25 through line2d_29), and tick labels regenerated; the old coordinates and label values are truncated in this rendering.]
@@ -4287,34 +4287,34 @@
[series--torch-eager (lines 4287-4320): the latency polyline and its 24 data-point markers regenerated. The old y-coordinates are truncated in this rendering; the unchanged endpoints are (82.966497, 405.060892) and (787.896439, 44.888614).]
@@ -4364,7 +4364,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
</span> |
- Cell: combine | 4.
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4453,7 +4453,7 @@ COMBINED BENCHMARK SUMMARY
impl wl p50(ms) ok
hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 False
hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 False
- hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.
hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 False
hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.09 False
hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 False
@@ -4478,8 +4478,8 @@ hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 False
torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
torch_eager cuda_B1_S128_H8_D128_R64 0.23 True
- torch_eager cuda_B1_S128_H8_D64_R32 0.
- torch_eager cuda_B1_S2048_H32_D128_R64 0.
torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True
torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True
torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True
@@ -4497,7 +4497,7 @@ torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True
torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
torch_eager cuda_B2_S512_H32_D128_R64 0.22 True
torch_eager cuda_B2_S512_H32_D64_R32 0.22 True
- torch_eager cuda_B2_S512_H8_D128_R64 0.
torch_eager cuda_B2_S512_H8_D64_R32 0.22 True

GENERATING COMBINED VISUALIZATION
@@ -4518,7 +4518,7 @@ Implementations included:
<div class="uv-install-logs" id="uv-logs-combine">
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
<div class="uv-logs-content" style="display: none;">
- Installed 37 packages in 219ms
</div>
</div>
<div class="cell-artifacts">
@@ -4531,7 +4531,7 @@ Installed 37 packages in 219ms
<rdf:RDF>
<ns2:Work>
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:date>2025-10-
<dc:format>image/svg+xml</dc:format>
<dc:creator>
<ns2:Agent>
@@ -4875,70 +4875,70 @@ Installed 37 packages in 219ms
<g id="matplotlib.axis_2">
- [second plot, yticks 1-5: grid-line paths at x=47.72, tick marks (marker m0fca2865ba: "M 0 0 L -3.5 0"), and y-axis labels; old y-coordinates and label values are truncated in the diff]
<g id="label--y" class="ylabel">
@@ -4946,34 +4946,34 @@ Installed 37 packages in 219ms
</g>
</g>
<g id="series--torch-eager" class="series">
- [connecting path and 22 interior round markers: old y-coordinates truncated in the diff; endpoints (82.966497, 405.060892) and (787.896439, 44.888614) unchanged]
</g>
</g>
<rdf:RDF>
<ns2:Work>
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:date>2025-10-29T14:27:54.393501</dc:date>
<dc:format>image/svg+xml</dc:format>
<dc:creator>
<ns2:Agent>
<g id="matplotlib.axis_2">
+ [first plot, y-axis redrawn: grid lines from x=47.72 to x=823.142937 and tick marks at y = 385.895403, 308.195371, 230.49534, 152.795309, 75.095278; tick labels 0.2, 0.3, 0.4, 0.5, 0.6]
<g id="label--y" class="ylabel">
</g>
</g>
<g id="series--torch-eager" class="series">
+ [connecting path "M 82.966497 405.060892 L 113.615625 365.214762 … L 787.896439 44.888614" (24 points, stroke #1f77b4) and matching round markers; interior y-coordinates updated, endpoints unchanged]
</g>
</g>
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
</span> |
+ Cell: combine | 4.35s
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
impl wl p50(ms) ok
hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 False
hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 False
+ hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.10 False
hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 False
hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.09 False
hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 False
torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
torch_eager cuda_B1_S128_H8_D128_R64 0.23 True
+ torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
+ torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True
torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True
torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True
torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True
torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
torch_eager cuda_B2_S512_H32_D128_R64 0.22 True
torch_eager cuda_B2_S512_H32_D64_R32 0.22 True
+ torch_eager cuda_B2_S512_H8_D128_R64 0.21 True
torch_eager cuda_B2_S512_H8_D64_R32 0.22 True

GENERATING COMBINED VISUALIZATION
<div class="uv-install-logs" id="uv-logs-combine">
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
<div class="uv-logs-content" style="display: none;">
+ Installed 37 packages in 239ms
</div>
</div>
<div class="cell-artifacts">
<rdf:RDF>
<ns2:Work>
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:date>2025-10-29T14:27:54.393501</dc:date>
<dc:format>image/svg+xml</dc:format>
<dc:creator>
<ns2:Agent>
<g id="matplotlib.axis_2">
+ [second plot, y-axis redrawn: identical grid-line positions, tick marks, and tick labels (0.2-0.6) to the first plot above]
<g id="label--y" class="ylabel">
</g>
</g>
<g id="series--torch-eager" class="series">
+ [second plot, series "torch-eager" redrawn with the same path and 24 round markers as the first plot above; endpoints (82.966497, 405.060892) and (787.896439, 44.888614) unchanged]
</g>
</g>