Upload folder using huggingface_hub
Browse files- activation/impls/artifacts/benchmark/activation.jsonl +9 -9
- activation/impls/cells/benchmark.py +7 -13
- activation/impls/hf_kernels_swiglu.html +99 -91
- activation/impls/torch_swiglu.html +129 -127
- activation/results/artifacts/combine/latency.svg +2 -2
- activation/results/combined_results.html +89 -107
- causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +24 -24
- causal_conv1d/impls/cells/benchmark.py +18 -9
- causal_conv1d/impls/hf_kernels_causal_conv1d.html +0 -0
- causal_conv1d/impls/torch_causal_conv1d.html +0 -0
- causal_conv1d/results/artifacts/combine/latency.svg +2 -2
- causal_conv1d/results/combined_results.html +145 -137
- flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -6
- flash_attn/impls/cells/benchmark.py +10 -9
- flash_attn/impls/flash_attention.html +151 -195
- flash_attn/impls/hf_kernels_flash_attn.html +102 -93
- flash_attn/impls/hf_kernels_flash_attn3.html +96 -84
- flash_attn/impls/mem_efficient_attention.html +139 -131
- flash_attn/impls/sage_attention.html +20 -17
- flash_attn/impls/xformers.html +146 -92
- flash_attn/results/artifacts/combine/latency.svg +2 -2
- flash_attn/results/combined_results.html +156 -148
- index.html +14 -12
- layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +4 -4
- layer_norm/impls/cells/benchmark.py +5 -28
- layer_norm/impls/hf_kernels_layer_norm.html +62 -54
- layer_norm/impls/torch_layer_norm.html +61 -59
- layer_norm/results/artifacts/combine/latency.svg +2 -2
- layer_norm/results/combined_results.html +60 -52
- rotary/impls/artifacts/benchmark/rotary.jsonl +24 -24
- rotary/impls/cells/benchmark.py +12 -21
- rotary/impls/hf_kernels_rotary.html +0 -0
- rotary/impls/torch_rotary.html +0 -0
- rotary/index.html +8 -0
- rotary/results/artifacts/combine/latency.svg +2 -2
- rotary/results/combined_results.html +302 -134
activation/impls/artifacts/benchmark/activation.jsonl
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
-
{"ts": "2025-10-
|
| 2 |
-
{"ts": "2025-10-
|
| 3 |
-
{"ts": "2025-10-
|
| 4 |
-
{"ts": "2025-10-
|
| 5 |
-
{"ts": "2025-10-
|
| 6 |
-
{"ts": "2025-10-
|
| 7 |
-
{"ts": "2025-10-
|
| 8 |
-
{"ts": "2025-10-
|
| 9 |
-
{"ts": "2025-10-
|
|
|
|
| 1 |
+
{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.040432000048440386, "p50": 0.04165099994679622, "p90": 0.0417410000181917, "mean": 0.04172699999571705, "iqr": 0.0011400000516914588, "raw_times": [0.0417410000181917, 0.04420999999865671, 0.040432000048440386, 0.04165099994679622, 0.04060099996650024], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.046430999987023824, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04963099996757592, "p50": 0.05265099997586731, "p90": 0.053851000018312334, "mean": 0.054568999985349365, "iqr": 0.0016500000583619112, "raw_times": [0.04963099996757592, 0.05265099997586731, 0.05220099995995042, 0.053851000018312334, 0.06451100000504084], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05472100002634761, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04966099999137441, "p50": 0.05102099999021448, "p90": 0.05103099999814731, "mean": 0.05151719999503257, "iqr": 0.0007099999947968172, "raw_times": [0.04966099999137441, 0.05555199999207616, 0.05032100000335049, 0.05102099999021448, 0.05103099999814731], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05423200002496742, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04886099998202553, "p50": 0.05024199998615586, "p90": 0.0503609999782384, "mean": 0.05005519998348973, "iqr": 0.0007900000014160469, "raw_times": [0.04886099998202553, 0.04957099997682235, 0.051240999994206504, 0.05024199998615586, 0.0503609999782384], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053871000034177996, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04914099997677113, "p50": 0.04985100002841136, "p90": 0.05049099996767836, "mean": 0.04988699998875745, "iqr": 0.0013399999829744047, "raw_times": [0.04915099998470396, 0.05080099998622245, 0.04985100002841136, 0.04914099997677113, 0.05049099996767836], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053920999960155314, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04656100003330721, "p50": 0.04960100000062084, "p90": 0.05333199999313365, "mean": 0.05254540001260466, "iqr": 0.0039209999727063405, "raw_times": [0.04656100003330721, 0.05333199999313365, 0.04960100000062084, 0.04941100002042731, 0.06382200001553429], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.051971000004868984, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 7 |
+
{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04889099994898061, "p50": 0.050290999979552, "p90": 0.05037099998617123, "mean": 0.05047499996635452, "iqr": 0.0002600000357233512, "raw_times": [0.04889099994898061, 0.052710999966620875, 0.050110999950447876, 0.05037099998617123, 0.050290999979552], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05234200000359124, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 8 |
+
{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0489209999727791, "p50": 0.04973099999006081, "p90": 0.05078099997035679, "mean": 0.051391199974659685, "iqr": 0.0012099999935344385, "raw_times": [0.0489209999727791, 0.05078099997035679, 0.04973099999006081, 0.04957099997682235, 0.05795199996327938], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0512020000087432, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 9 |
+
{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04852099999652637, "p50": 0.04917100000056962, "p90": 0.049370999988695985, "mean": 0.049055200008751854, "iqr": 0.0007299999538190605, "raw_times": [0.04852099999652637, 0.048641000034876924, 0.04917100000056962, 0.049370999988695985, 0.04957200002309037], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05309099998385136, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
activation/impls/cells/benchmark.py
CHANGED
|
@@ -4,7 +4,6 @@
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
-
# "kernels",
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
|
@@ -13,22 +12,17 @@
|
|
| 13 |
import torch
|
| 14 |
import sys
|
| 15 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
-
|
| 17 |
|
| 18 |
-
# Load the activation kernel
|
| 19 |
-
activation = get_kernel("kernels-community/activation")
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
out_shape = input_tensor.shape[:-1] + (hidden_dim,)
|
| 25 |
-
out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
|
| 26 |
-
return activation.silu_and_mul(out, input_tensor)
|
| 27 |
|
| 28 |
|
| 29 |
run_benchmark(
|
| 30 |
kernel_type=KernelTypeEnum.ACTIVATION,
|
| 31 |
-
impl_name="
|
| 32 |
-
impl_tags={"family":
|
| 33 |
-
impl_func=
|
| 34 |
)
|
|
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
|
|
|
| 7 |
# ]
|
| 8 |
#
|
| 9 |
# [tool.uv.sources]
|
|
|
|
| 12 |
import torch
|
| 13 |
import sys
|
| 14 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 15 |
+
import torch, torch.nn.functional as F
|
| 16 |
|
|
|
|
|
|
|
| 17 |
|
| 18 |
+
def swiglu_eager(x):
|
| 19 |
+
d = x.shape[-1] // 2
|
| 20 |
+
return F.silu(x[..., :d]) * x[..., d:]
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
run_benchmark(
|
| 24 |
kernel_type=KernelTypeEnum.ACTIVATION,
|
| 25 |
+
impl_name="torch_eager",
|
| 26 |
+
impl_tags={"family":"hf-kernels", "backend":"eager"},
|
| 27 |
+
impl_func=swiglu_eager,
|
| 28 |
)
|
activation/impls/hf_kernels_swiglu.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
@@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3871 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: nv | 0.
|
| 3875 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3877 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3887,7 +3895,7 @@ Cell: nv | 0.26s
|
|
| 3887 |
</div>
|
| 3888 |
</div>
|
| 3889 |
<div id="output-nv" class="cell-output">
|
| 3890 |
-
<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29
|
| 3891 |
+-----------------------------------------------------------------------------------------+
|
| 3892 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3893 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -3896,7 +3904,7 @@ Cell: nv | 0.26s
|
|
| 3896 |
| | | MIG M. |
|
| 3897 |
|=========================================+========================+======================|
|
| 3898 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3899 |
-
| N/A
|
| 3900 |
| | | N/A |
|
| 3901 |
+-----------------------------------------+------------------------+----------------------+
|
| 3902 |
|
|
@@ -3920,7 +3928,7 @@ Cell: nv | 0.26s
|
|
| 3920 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3921 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3922 |
</span> |
|
| 3923 |
-
Cell: benchmark |
|
| 3924 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3925 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3926 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3976,17 +3984,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
|
|
| 3976 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3977 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3978 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3979 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3980 |
-
hf_kernels_swiglu
|
| 3981 |
-
_activation_beeaae6::silu_and_mul 1.
|
| 3982 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3983 |
-
Activity Buffer Request
|
| 3984 |
-
aten::empty 2.
|
| 3985 |
-
cudaLaunchKernel 2.
|
| 3986 |
-
cudaDeviceSynchronize 0.
|
| 3987 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3988 |
-
Self CPU time total: 1.
|
| 3989 |
-
Self CUDA time total: 4.
|
| 3990 |
|
| 3991 |
|
| 3992 |
|
|
@@ -3996,16 +4004,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
|
|
| 3996 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3997 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3998 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3999 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4000 |
-
hf_kernels_swiglu
|
| 4001 |
-
_activation_beeaae6::silu_and_mul 1.
|
| 4002 |
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3
|
| 4003 |
-
Activity Buffer Request
|
| 4004 |
-
aten::empty 1.
|
| 4005 |
-
cudaLaunchKernel 1.
|
| 4006 |
-
cudaDeviceSynchronize 0.
|
| 4007 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4008 |
-
Self CPU time total: 1.
|
| 4009 |
Self CUDA time total: 3.968us
|
| 4010 |
|
| 4011 |
|
|
@@ -4016,17 +4024,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
|
|
| 4016 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4017 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4018 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4019 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4020 |
-
hf_kernels_swiglu
|
| 4021 |
-
_activation_beeaae6::silu_and_mul 1.
|
| 4022 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4023 |
-
Activity Buffer Request
|
| 4024 |
-
aten::empty 1.
|
| 4025 |
-
cudaLaunchKernel 1.
|
| 4026 |
-
cudaDeviceSynchronize 0.
|
| 4027 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4028 |
-
Self CPU time total: 1.
|
| 4029 |
-
Self CUDA time total: 4.
|
| 4030 |
|
| 4031 |
|
| 4032 |
|
|
@@ -4036,17 +4044,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
|
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4038 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4039 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4040 |
-
hf_kernels_swiglu
|
| 4041 |
-
_activation_beeaae6::silu_and_mul 1.
|
| 4042 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4043 |
-
Activity Buffer Request
|
| 4044 |
-
aten::empty 1.
|
| 4045 |
-
cudaLaunchKernel
|
| 4046 |
-
cudaDeviceSynchronize 0.
|
| 4047 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4048 |
-
Self CPU time total: 1.
|
| 4049 |
-
Self CUDA time total: 4.
|
| 4050 |
|
| 4051 |
|
| 4052 |
|
|
@@ -4056,17 +4064,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
|
|
| 4056 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4057 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4058 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4059 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4060 |
-
hf_kernels_swiglu
|
| 4061 |
-
_activation_beeaae6::silu_and_mul
|
| 4062 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4063 |
-
Activity Buffer Request
|
| 4064 |
-
aten::empty
|
| 4065 |
-
cudaLaunchKernel
|
| 4066 |
-
cudaDeviceSynchronize 0.
|
| 4067 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4068 |
-
Self CPU time total:
|
| 4069 |
-
Self CUDA time total: 5.
|
| 4070 |
|
| 4071 |
|
| 4072 |
|
|
@@ -4076,17 +4084,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
|
|
| 4076 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4077 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4078 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4079 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4080 |
-
hf_kernels_swiglu
|
| 4081 |
-
_activation_beeaae6::silu_and_mul
|
| 4082 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4083 |
-
Activity Buffer Request
|
| 4084 |
-
aten::empty
|
| 4085 |
-
cudaLaunchKernel
|
| 4086 |
-
cudaDeviceSynchronize
|
| 4087 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4088 |
-
Self CPU time total:
|
| 4089 |
-
Self CUDA time total: 7.
|
| 4090 |
|
| 4091 |
|
| 4092 |
|
|
@@ -4096,16 +4104,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
|
|
| 4096 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4097 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4098 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4099 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4100 |
-
hf_kernels_swiglu
|
| 4101 |
-
_activation_beeaae6::silu_and_mul
|
| 4102 |
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 100.00% 6.560us 2.187us 3
|
| 4103 |
-
Activity Buffer Request
|
| 4104 |
-
aten::empty
|
| 4105 |
-
cudaLaunchKernel
|
| 4106 |
-
cudaDeviceSynchronize
|
| 4107 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4108 |
-
Self CPU time total:
|
| 4109 |
Self CUDA time total: 6.560us
|
| 4110 |
|
| 4111 |
|
|
@@ -4116,16 +4124,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
|
|
| 4116 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4117 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4118 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4119 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4120 |
-
hf_kernels_swiglu
|
| 4121 |
-
_activation_beeaae6::silu_and_mul
|
| 4122 |
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.408us 100.00% 9.408us 3.136us 3
|
| 4123 |
-
Activity Buffer Request
|
| 4124 |
-
aten::empty
|
| 4125 |
-
cudaLaunchKernel
|
| 4126 |
-
cudaDeviceSynchronize 0.
|
| 4127 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4128 |
-
Self CPU time total:
|
| 4129 |
Self CUDA time total: 9.408us
|
| 4130 |
|
| 4131 |
|
|
@@ -4136,17 +4144,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
|
|
| 4136 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4137 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4138 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4139 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4140 |
-
hf_kernels_swiglu
|
| 4141 |
-
_activation_beeaae6::silu_and_mul
|
| 4142 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4143 |
-
Activity Buffer Request
|
| 4144 |
-
aten::empty
|
| 4145 |
-
cudaLaunchKernel
|
| 4146 |
-
cudaDeviceSynchronize
|
| 4147 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4148 |
-
Self CPU time total:
|
| 4149 |
-
Self CUDA time total:
|
| 4150 |
|
| 4151 |
|
| 4152 |
impl wl p50(ms) ok
|
|
@@ -4163,12 +4171,12 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
|
|
| 4163 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4164 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4165 |
<div class="uv-logs-content" style="display: none;">
|
| 4166 |
-
Installed
|
| 4167 |
</div>
|
| 4168 |
</div>
|
| 4169 |
<div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
|
| 4170 |
-
Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00,
|
| 4171 |
-
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00,
|
| 4172 |
<div class="cell-artifacts">
|
| 4173 |
<h4>Artifacts:</h4>
|
| 4174 |
<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
|
|
|
| 3879 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3880 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3881 |
</span> |
|
| 3882 |
+
Cell: nv | 0.21s
|
| 3883 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3884 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3885 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3895 |
</div>
|
| 3896 |
</div>
|
| 3897 |
<div id="output-nv" class="cell-output">
|
| 3898 |
+
<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 15:50:40 2025
|
| 3899 |
+-----------------------------------------------------------------------------------------+
|
| 3900 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3901 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3904 |
| | | MIG M. |
|
| 3905 |
|=========================================+========================+======================|
|
| 3906 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3907 |
+
| N/A 28C P0 78W / 350W | 0MiB / 46068MiB | 11% Default |
|
| 3908 |
| | | N/A |
|
| 3909 |
+-----------------------------------------+------------------------+----------------------+
|
| 3910 |
|
|
|
|
| 3928 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3929 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3930 |
</span> |
|
| 3931 |
+
Cell: benchmark | 7.78s
|
| 3932 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3933 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3934 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3984 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3985 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 79.968us 1983.33% 79.968us 79.968us 1
|
| 3988 |
+
hf_kernels_swiglu 10.58% 184.424us 99.57% 1.736ms 1.736ms 0.000us 0.00% 5.408us 5.408us 1
|
| 3989 |
+
_activation_beeaae6::silu_and_mul 1.26% 21.900us 86.25% 1.504ms 501.188us 4.032us 100.00% 5.408us 1.803us 3
|
| 3990 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.032us 100.00% 4.032us 1.344us 3
|
| 3991 |
+
Activity Buffer Request 82.49% 1.438ms 82.49% 1.438ms 1.438ms 1.376us 34.13% 1.376us 1.376us 1
|
| 3992 |
+
aten::empty 2.74% 47.772us 2.74% 47.772us 15.924us 0.000us 0.00% 0.000us 0.000us 3
|
| 3993 |
+
cudaLaunchKernel 2.50% 43.631us 2.50% 43.631us 14.544us 0.000us 0.00% 0.000us 0.000us 3
|
| 3994 |
+
cudaDeviceSynchronize 0.43% 7.440us 0.43% 7.440us 7.440us 0.000us 0.00% 0.000us 0.000us 1
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
+
Self CPU time total: 1.743ms
|
| 3997 |
+
Self CUDA time total: 4.032us
|
| 3998 |
|
| 3999 |
|
| 4000 |
|
|
|
|
| 4004 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4005 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4006 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4007 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 60.192us 1516.94% 60.192us 60.192us 1
|
| 4008 |
+
hf_kernels_swiglu 5.66% 89.803us 99.62% 1.581ms 1.581ms 0.000us 0.00% 5.312us 5.312us 1
|
| 4009 |
+
_activation_beeaae6::silu_and_mul 1.35% 21.470us 92.79% 1.473ms 491.035us 3.968us 100.00% 5.312us 1.771us 3
|
| 4010 |
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3
|
| 4011 |
+
Activity Buffer Request 89.86% 1.427ms 89.86% 1.427ms 1.427ms 1.344us 33.87% 1.344us 1.344us 1
|
| 4012 |
+
aten::empty 1.17% 18.590us 1.17% 18.590us 6.197us 0.000us 0.00% 0.000us 0.000us 3
|
| 4013 |
+
cudaLaunchKernel 1.58% 25.022us 1.58% 25.022us 8.341us 0.000us 0.00% 0.000us 0.000us 3
|
| 4014 |
+
cudaDeviceSynchronize 0.38% 6.110us 0.38% 6.110us 6.110us 0.000us 0.00% 0.000us 0.000us 1
|
| 4015 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4016 |
+
Self CPU time total: 1.588ms
|
| 4017 |
Self CUDA time total: 3.968us
|
| 4018 |
|
| 4019 |
|
|
|
|
| 4024 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4025 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4026 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4027 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.535us 1338.54% 65.535us 65.535us 1
|
| 4028 |
+
hf_kernels_swiglu 5.56% 88.483us 99.64% 1.586ms 1.586ms 0.000us 0.00% 6.528us 6.528us 1
|
| 4029 |
+
_activation_beeaae6::silu_and_mul 1.35% 21.452us 92.87% 1.478ms 492.822us 4.896us 100.00% 6.528us 2.176us 3
|
| 4030 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.896us 100.00% 4.896us 1.632us 3
|
| 4031 |
+
Activity Buffer Request 89.90% 1.431ms 89.90% 1.431ms 1.431ms 1.632us 33.33% 1.632us 1.632us 1
|
| 4032 |
+
aten::empty 1.21% 19.310us 1.21% 19.310us 6.437us 0.000us 0.00% 0.000us 0.000us 3
|
| 4033 |
+
cudaLaunchKernel 1.63% 25.910us 1.63% 25.910us 8.637us 0.000us 0.00% 0.000us 0.000us 3
|
| 4034 |
+
cudaDeviceSynchronize 0.36% 5.661us 0.36% 5.661us 5.661us 0.000us 0.00% 0.000us 0.000us 1
|
| 4035 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4036 |
+
Self CPU time total: 1.592ms
|
| 4037 |
+
Self CUDA time total: 4.896us
|
| 4038 |
|
| 4039 |
|
| 4040 |
|
|
|
|
| 4044 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4045 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4046 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4047 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.008us 1562.69% 67.008us 67.008us 1
|
| 4048 |
+
hf_kernels_swiglu 4.93% 90.832us 99.72% 1.836ms 1.836ms 0.000us 0.00% 5.728us 5.728us 1
|
| 4049 |
+
_activation_beeaae6::silu_and_mul 1.23% 22.581us 93.74% 1.726ms 575.177us 4.288us 100.00% 5.728us 1.909us 3
|
| 4050 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.288us 100.00% 4.288us 1.429us 3
|
| 4051 |
+
Activity Buffer Request 81.40% 1.498ms 81.40% 1.498ms 1.498ms 1.440us 33.58% 1.440us 1.440us 1
|
| 4052 |
+
aten::empty 1.04% 19.180us 1.04% 19.180us 6.393us 0.000us 0.00% 0.000us 0.000us 3
|
| 4053 |
+
cudaLaunchKernel 11.11% 204.595us 11.11% 204.595us 68.198us 0.000us 0.00% 0.000us 0.000us 3
|
| 4054 |
+
cudaDeviceSynchronize 0.28% 5.180us 0.28% 5.180us 5.180us 0.000us 0.00% 0.000us 0.000us 1
|
| 4055 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4056 |
+
Self CPU time total: 1.841ms
|
| 4057 |
+
Self CUDA time total: 4.288us
|
| 4058 |
|
| 4059 |
|
| 4060 |
|
|
|
|
| 4064 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4065 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4066 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4067 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.800us 1106.37% 64.800us 64.800us 1
|
| 4068 |
+
hf_kernels_swiglu 5.65% 97.973us 99.69% 1.728ms 1.728ms 0.000us 0.00% 7.810us 7.810us 1
|
| 4069 |
+
_activation_beeaae6::silu_and_mul 1.27% 22.090us 92.96% 1.611ms 536.996us 5.857us 100.00% 7.810us 2.603us 3
|
| 4070 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.857us 100.00% 5.857us 1.952us 3
|
| 4071 |
+
Activity Buffer Request 82.37% 1.427ms 82.37% 1.427ms 1.427ms 1.953us 33.34% 1.953us 1.953us 1
|
| 4072 |
+
aten::empty 1.09% 18.810us 1.09% 18.810us 6.270us 0.000us 0.00% 0.000us 0.000us 3
|
| 4073 |
+
cudaLaunchKernel 9.31% 161.434us 9.31% 161.434us 53.811us 0.000us 0.00% 0.000us 0.000us 3
|
| 4074 |
+
cudaDeviceSynchronize 0.31% 5.300us 0.31% 5.300us 5.300us 0.000us 0.00% 0.000us 0.000us 1
|
| 4075 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4076 |
+
Self CPU time total: 1.733ms
|
| 4077 |
+
Self CUDA time total: 5.857us
|
| 4078 |
|
| 4079 |
|
| 4080 |
|
|
|
|
| 4084 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4085 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4086 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4087 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 77.311us 1002.48% 77.311us 77.311us 1
|
| 4088 |
+
hf_kernels_swiglu 20.04% 98.272us 98.88% 484.972us 484.972us 0.000us 0.00% 10.304us 10.304us 1
|
| 4089 |
+
_activation_beeaae6::silu_and_mul 4.97% 24.390us 74.66% 366.210us 122.070us 7.712us 100.00% 10.304us 3.435us 3
|
| 4090 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 100.00% 7.712us 2.571us 3
|
| 4091 |
+
Activity Buffer Request 34.13% 167.415us 34.13% 167.415us 167.415us 2.592us 33.61% 2.592us 2.592us 1
|
| 4092 |
+
aten::empty 4.18% 20.490us 4.18% 20.490us 6.830us 0.000us 0.00% 0.000us 0.000us 3
|
| 4093 |
+
cudaLaunchKernel 35.56% 174.405us 35.56% 174.405us 58.135us 0.000us 0.00% 0.000us 0.000us 3
|
| 4094 |
+
cudaDeviceSynchronize 1.12% 5.511us 1.12% 5.511us 5.511us 0.000us 0.00% 0.000us 0.000us 1
|
| 4095 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4096 |
+
Self CPU time total: 490.483us
|
| 4097 |
+
Self CUDA time total: 7.712us
|
| 4098 |
|
| 4099 |
|
| 4100 |
|
|
|
|
| 4104 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4105 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4106 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4107 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.327us 965.35% 63.327us 63.327us 1
|
| 4108 |
+
hf_kernels_swiglu 20.14% 83.823us 98.84% 411.400us 411.400us 0.000us 0.00% 8.768us 8.768us 1
|
| 4109 |
+
_activation_beeaae6::silu_and_mul 5.43% 22.601us 74.29% 309.187us 103.062us 6.560us 100.00% 8.768us 2.923us 3
|
| 4110 |
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 100.00% 6.560us 2.187us 3
|
| 4111 |
+
Activity Buffer Request 32.27% 134.313us 32.27% 134.313us 134.313us 2.208us 33.66% 2.208us 2.208us 1
|
| 4112 |
+
aten::empty 4.42% 18.390us 4.42% 18.390us 6.130us 0.000us 0.00% 0.000us 0.000us 3
|
| 4113 |
+
cudaLaunchKernel 36.59% 152.273us 36.59% 152.273us 50.758us 0.000us 0.00% 0.000us 0.000us 3
|
| 4114 |
+
cudaDeviceSynchronize 1.16% 4.810us 1.16% 4.810us 4.810us 0.000us 0.00% 0.000us 0.000us 1
|
| 4115 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4116 |
+
Self CPU time total: 416.210us
|
| 4117 |
Self CUDA time total: 6.560us
|
| 4118 |
|
| 4119 |
|
|
|
|
| 4124 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4125 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4126 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4127 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.952us 743.54% 69.952us 69.952us 1
|
| 4128 |
+
hf_kernels_swiglu 5.37% 93.270us 99.70% 1.733ms 1.733ms 0.000us 0.00% 12.544us 12.544us 1
|
| 4129 |
+
_activation_beeaae6::silu_and_mul 1.28% 22.251us 93.17% 1.619ms 539.830us 9.408us 100.00% 12.544us 4.181us 3
|
| 4130 |
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.408us 100.00% 9.408us 3.136us 3
|
| 4131 |
+
Activity Buffer Request 83.02% 1.443ms 83.02% 1.443ms 1.443ms 3.136us 33.33% 3.136us 3.136us 1
|
| 4132 |
+
aten::empty 1.17% 20.271us 1.17% 20.271us 6.757us 0.000us 0.00% 0.000us 0.000us 3
|
| 4133 |
+
cudaLaunchKernel 8.87% 154.165us 8.87% 154.165us 51.388us 0.000us 0.00% 0.000us 0.000us 3
|
| 4134 |
+
cudaDeviceSynchronize 0.30% 5.210us 0.30% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1
|
| 4135 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4136 |
+
Self CPU time total: 1.738ms
|
| 4137 |
Self CUDA time total: 9.408us
|
| 4138 |
|
| 4139 |
|
|
|
|
| 4144 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4145 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4146 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4147 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.278us 502.45% 65.278us 65.278us 1
|
| 4148 |
+
hf_kernels_swiglu 20.56% 86.143us 98.78% 413.910us 413.910us 0.000us 0.00% 17.344us 17.344us 1
|
| 4149 |
+
_activation_beeaae6::silu_and_mul 5.61% 23.493us 73.70% 308.818us 102.939us 12.992us 100.00% 17.344us 5.781us 3
|
| 4150 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 12.992us 100.00% 12.992us 4.331us 3
|
| 4151 |
+
Activity Buffer Request 31.64% 132.592us 31.64% 132.592us 132.592us 4.352us 33.50% 4.352us 4.352us 1
|
| 4152 |
+
aten::empty 4.52% 18.949us 4.52% 18.949us 6.316us 0.000us 0.00% 0.000us 0.000us 3
|
| 4153 |
+
cudaLaunchKernel 36.45% 152.733us 36.45% 152.733us 50.911us 0.000us 0.00% 0.000us 0.000us 3
|
| 4154 |
+
cudaDeviceSynchronize 1.22% 5.130us 1.22% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1
|
| 4155 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4156 |
+
Self CPU time total: 419.040us
|
| 4157 |
+
Self CUDA time total: 12.992us
|
| 4158 |
|
| 4159 |
|
| 4160 |
impl wl p50(ms) ok
|
|
|
|
| 4171 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4172 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4173 |
<div class="uv-logs-content" style="display: none;">
|
| 4174 |
+
Installed 52 packages in 252ms
|
| 4175 |
</div>
|
| 4176 |
</div>
|
| 4177 |
<div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
|
| 4178 |
+
Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 17.75it/s]
|
| 4179 |
+
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 24.82it/s]</div>
|
| 4180 |
<div class="cell-artifacts">
|
| 4181 |
<h4>Artifacts:</h4>
|
| 4182 |
<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
|
activation/impls/torch_swiglu.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
@@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3871 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: nv | 0.
|
| 3875 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3877 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3887,7 +3895,7 @@ Cell: nv | 0.26s
|
|
| 3887 |
</div>
|
| 3888 |
</div>
|
| 3889 |
<div id="output-nv" class="cell-output">
|
| 3890 |
-
<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29
|
| 3891 |
+-----------------------------------------------------------------------------------------+
|
| 3892 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3893 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -3896,7 +3904,7 @@ Cell: nv | 0.26s
|
|
| 3896 |
| | | MIG M. |
|
| 3897 |
|=========================================+========================+======================|
|
| 3898 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3899 |
-
| N/A
|
| 3900 |
| | | N/A |
|
| 3901 |
+-----------------------------------------+------------------------+----------------------+
|
| 3902 |
|
|
@@ -3918,9 +3926,9 @@ Cell: nv | 0.26s
|
|
| 3918 |
<span class="collapse-indicators">
|
| 3919 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3920 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3921 |
-
<span id="uv-indicator-benchmark"
|
| 3922 |
</span> |
|
| 3923 |
-
Cell: benchmark |
|
| 3924 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3925 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3926 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3970,19 +3978,19 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
|
|
| 3970 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3971 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3972 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3973 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3974 |
-
torch_eager
|
| 3975 |
-
aten::silu 3.
|
| 3976 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 3977 |
-
aten::mul 1.
|
| 3978 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 3979 |
-
Activity Buffer Request
|
| 3980 |
-
aten::slice 2.
|
| 3981 |
-
aten::as_strided 0.61% 11.
|
| 3982 |
-
cudaLaunchKernel 3.
|
| 3983 |
-
cudaDeviceSynchronize 0.40% 7.
|
| 3984 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3985 |
-
Self CPU time total: 1.
|
| 3986 |
Self CUDA time total: 12.768us
|
| 3987 |
|
| 3988 |
|
|
@@ -3993,20 +4001,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
|
|
| 3993 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3994 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3997 |
-
torch_eager
|
| 3998 |
-
aten::silu 2.
|
| 3999 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.
|
| 4000 |
-
aten::mul 1.
|
| 4001 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4002 |
-
Activity Buffer Request
|
| 4003 |
-
aten::slice 1.
|
| 4004 |
-
aten::as_strided 0.
|
| 4005 |
-
cudaLaunchKernel 2.
|
| 4006 |
-
cudaDeviceSynchronize 0.
|
| 4007 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4008 |
-
Self CPU time total: 1.
|
| 4009 |
-
Self CUDA time total: 12.
|
| 4010 |
|
| 4011 |
|
| 4012 |
|
|
@@ -4016,20 +4024,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
|
|
| 4016 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4017 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4018 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4019 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4020 |
-
torch_eager
|
| 4021 |
-
aten::silu 2.
|
| 4022 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4023 |
-
aten::mul 1.
|
| 4024 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4025 |
-
Activity Buffer Request
|
| 4026 |
-
aten::slice 1.
|
| 4027 |
-
aten::as_strided 0.
|
| 4028 |
-
cudaLaunchKernel 2.
|
| 4029 |
-
cudaDeviceSynchronize 0.
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
-
Self CPU time total: 1.
|
| 4032 |
-
Self CUDA time total: 13.
|
| 4033 |
|
| 4034 |
|
| 4035 |
|
|
@@ -4039,20 +4047,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
|
|
| 4039 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4040 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4041 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4042 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4043 |
-
torch_eager 6.
|
| 4044 |
-
aten::silu 2.
|
| 4045 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.
|
| 4046 |
-
aten::mul 1.
|
| 4047 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4048 |
-
Activity Buffer Request 74.
|
| 4049 |
-
aten::slice 1.
|
| 4050 |
-
aten::as_strided 0.
|
| 4051 |
-
cudaLaunchKernel
|
| 4052 |
-
cudaDeviceSynchronize 0.
|
| 4053 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4054 |
-
Self CPU time total: 1.
|
| 4055 |
-
Self CUDA time total: 12.
|
| 4056 |
|
| 4057 |
|
| 4058 |
|
|
@@ -4062,20 +4070,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
|
|
| 4062 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4063 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4064 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4065 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4066 |
-
torch_eager 6.
|
| 4067 |
-
aten::silu 2.
|
| 4068 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4069 |
-
aten::mul 1.
|
| 4070 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4071 |
-
Activity Buffer Request
|
| 4072 |
-
aten::slice 1.34%
|
| 4073 |
-
aten::as_strided 0.
|
| 4074 |
-
cudaLaunchKernel
|
| 4075 |
-
cudaDeviceSynchronize 0.
|
| 4076 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4077 |
-
Self CPU time total: 1.
|
| 4078 |
-
Self CUDA time total: 13.
|
| 4079 |
|
| 4080 |
|
| 4081 |
|
|
@@ -4085,20 +4093,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
|
|
| 4085 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4086 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4087 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4088 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4089 |
-
torch_eager
|
| 4090 |
-
aten::silu
|
| 4091 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.
|
| 4092 |
-
aten::mul
|
| 4093 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4094 |
-
Activity Buffer Request
|
| 4095 |
-
aten::slice
|
| 4096 |
-
aten::as_strided
|
| 4097 |
-
cudaLaunchKernel
|
| 4098 |
-
cudaDeviceSynchronize 0.
|
| 4099 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4100 |
-
Self CPU time total:
|
| 4101 |
-
Self CUDA time total: 15.
|
| 4102 |
|
| 4103 |
|
| 4104 |
|
|
@@ -4108,20 +4116,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
|
|
| 4108 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4109 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4110 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4111 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4112 |
-
torch_eager 6.
|
| 4113 |
-
aten::silu 2.
|
| 4114 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.327us 51.
|
| 4115 |
-
aten::mul 1.
|
| 4116 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4117 |
-
Activity Buffer Request
|
| 4118 |
-
aten::slice 1.
|
| 4119 |
-
aten::as_strided 0.32%
|
| 4120 |
-
cudaLaunchKernel
|
| 4121 |
-
cudaDeviceSynchronize 0.
|
| 4122 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4123 |
-
Self CPU time total: 1.
|
| 4124 |
-
Self CUDA time total: 14.
|
| 4125 |
|
| 4126 |
|
| 4127 |
|
|
@@ -4131,20 +4139,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
|
|
| 4131 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4132 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4133 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4134 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4135 |
-
torch_eager
|
| 4136 |
-
aten::silu
|
| 4137 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4138 |
-
aten::mul
|
| 4139 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4140 |
-
Activity Buffer Request
|
| 4141 |
-
aten::slice
|
| 4142 |
-
aten::as_strided
|
| 4143 |
-
cudaLaunchKernel
|
| 4144 |
-
cudaDeviceSynchronize
|
| 4145 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4146 |
-
Self CPU time total:
|
| 4147 |
-
Self CUDA time total: 15.
|
| 4148 |
|
| 4149 |
|
| 4150 |
|
|
@@ -4154,20 +4162,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
|
|
| 4154 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4155 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4156 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4157 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4158 |
-
torch_eager 5.
|
| 4159 |
-
aten::silu 2.
|
| 4160 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.
|
| 4161 |
-
aten::mul 1.
|
| 4162 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4163 |
-
Activity Buffer Request
|
| 4164 |
-
aten::slice 1.37% 25.
|
| 4165 |
-
aten::as_strided 0.
|
| 4166 |
-
cudaLaunchKernel
|
| 4167 |
-
cudaDeviceSynchronize 0.
|
| 4168 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4169 |
-
Self CPU time total: 1.
|
| 4170 |
-
Self CUDA time total: 22.
|
| 4171 |
|
| 4172 |
|
| 4173 |
impl wl p50(ms) ok
|
|
@@ -4181,12 +4189,6 @@ torch_eager cuda_T512_D1024 0.05 True
|
|
| 4181 |
torch_eager cuda_T512_D2048 0.05 True
|
| 4182 |
torch_eager cuda_T512_D768 0.05 True
|
| 4183 |
</pre></div>
|
| 4184 |
-
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4185 |
-
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4186 |
-
<div class="uv-logs-content" style="display: none;">
|
| 4187 |
-
Installed 37 packages in 230ms
|
| 4188 |
-
</div>
|
| 4189 |
-
</div>
|
| 4190 |
<div class="cell-artifacts">
|
| 4191 |
<h4>Artifacts:</h4>
|
| 4192 |
<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
|
|
|
| 3879 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3880 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3881 |
</span> |
|
| 3882 |
+
Cell: nv | 0.21s
|
| 3883 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3884 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3885 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3895 |
</div>
|
| 3896 |
</div>
|
| 3897 |
<div id="output-nv" class="cell-output">
|
| 3898 |
+
<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 15:50:40 2025
|
| 3899 |
+-----------------------------------------------------------------------------------------+
|
| 3900 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3901 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3904 |
| | | MIG M. |
|
| 3905 |
|=========================================+========================+======================|
|
| 3906 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3907 |
+
| N/A 28C P0 78W / 350W | 0MiB / 46068MiB | 11% Default |
|
| 3908 |
| | | N/A |
|
| 3909 |
+-----------------------------------------+------------------------+----------------------+
|
| 3910 |
|
|
|
|
| 3926 |
<span class="collapse-indicators">
|
| 3927 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3928 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3929 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3930 |
</span> |
|
| 3931 |
+
Cell: benchmark | 3.39s
|
| 3932 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3933 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3934 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3978 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3979 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3980 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3981 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 198.560us 1555.14% 198.560us 198.560us 1
|
| 3982 |
+
torch_eager 10.82% 202.394us 99.60% 1.864ms 1.864ms 0.000us 0.00% 15.104us 15.104us 1
|
| 3983 |
+
aten::silu 3.05% 57.001us 82.79% 1.549ms 516.356us 6.560us 51.38% 8.896us 2.965us 3
|
| 3984 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.38% 6.560us 2.187us 3
|
| 3985 |
+
aten::mul 1.85% 34.663us 3.11% 58.253us 19.418us 6.208us 48.62% 6.208us 2.069us 3
|
| 3986 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 48.62% 6.208us 2.069us 3
|
| 3987 |
+
Activity Buffer Request 77.33% 1.447ms 77.33% 1.447ms 1.447ms 2.336us 18.30% 2.336us 2.336us 1
|
| 3988 |
+
aten::slice 2.27% 42.481us 2.88% 53.841us 8.973us 0.000us 0.00% 0.000us 0.000us 6
|
| 3989 |
+
aten::as_strided 0.61% 11.360us 0.61% 11.360us 1.893us 0.000us 0.00% 0.000us 0.000us 6
|
| 3990 |
+
cudaLaunchKernel 3.67% 68.681us 3.67% 68.681us 11.447us 0.000us 0.00% 0.000us 0.000us 6
|
| 3991 |
+
cudaDeviceSynchronize 0.40% 7.560us 0.40% 7.560us 7.560us 0.000us 0.00% 0.000us 0.000us 1
|
| 3992 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3993 |
+
Self CPU time total: 1.871ms
|
| 3994 |
Self CUDA time total: 12.768us
|
| 3995 |
|
| 3996 |
|
|
|
|
| 4001 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4002 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.854us 1245.68% 153.854us 153.854us 1
|
| 4005 |
+
torch_eager 7.83% 135.935us 99.65% 1.729ms 1.729ms 0.000us 0.00% 14.495us 14.495us 1
|
| 4006 |
+
aten::silu 2.47% 42.821us 87.44% 1.517ms 505.699us 6.399us 51.81% 8.543us 2.848us 3
|
| 4007 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.81% 6.399us 2.133us 3
|
| 4008 |
+
aten::mul 1.58% 27.360us 2.69% 46.680us 15.560us 5.952us 48.19% 5.952us 1.984us 3
|
| 4009 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.952us 48.19% 5.952us 1.984us 3
|
| 4010 |
+
Activity Buffer Request 83.34% 1.446ms 83.34% 1.446ms 1.446ms 2.144us 17.36% 2.144us 2.144us 1
|
| 4011 |
+
aten::slice 1.38% 23.991us 1.69% 29.361us 4.893us 0.000us 0.00% 0.000us 0.000us 6
|
| 4012 |
+
aten::as_strided 0.31% 5.370us 0.31% 5.370us 0.895us 0.000us 0.00% 0.000us 0.000us 6
|
| 4013 |
+
cudaLaunchKernel 2.74% 47.550us 2.74% 47.550us 7.925us 0.000us 0.00% 0.000us 0.000us 6
|
| 4014 |
+
cudaDeviceSynchronize 0.35% 6.041us 0.35% 6.041us 6.041us 0.000us 0.00% 0.000us 0.000us 1
|
| 4015 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4016 |
+
Self CPU time total: 1.735ms
|
| 4017 |
+
Self CUDA time total: 12.351us
|
| 4018 |
|
| 4019 |
|
| 4020 |
|
|
|
|
| 4024 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4025 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4026 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4027 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.990us 1157.70% 152.990us 152.990us 1
|
| 4028 |
+
torch_eager 7.93% 136.944us 99.69% 1.722ms 1.722ms 0.000us 0.00% 15.487us 15.487us 1
|
| 4029 |
+
aten::silu 2.43% 41.922us 87.32% 1.508ms 502.829us 6.752us 51.09% 9.024us 3.008us 3
|
| 4030 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 51.09% 6.752us 2.251us 3
|
| 4031 |
+
aten::mul 1.55% 26.841us 2.71% 46.791us 15.597us 6.463us 48.91% 6.463us 2.154us 3
|
| 4032 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.463us 48.91% 6.463us 2.154us 3
|
| 4033 |
+
Activity Buffer Request 83.33% 1.439ms 83.33% 1.439ms 1.439ms 2.272us 17.19% 2.272us 2.272us 1
|
| 4034 |
+
aten::slice 1.41% 24.420us 1.74% 29.990us 4.998us 0.000us 0.00% 0.000us 0.000us 6
|
| 4035 |
+
aten::as_strided 0.32% 5.570us 0.32% 5.570us 0.928us 0.000us 0.00% 0.000us 0.000us 6
|
| 4036 |
+
cudaLaunchKernel 2.72% 47.030us 2.72% 47.030us 7.838us 0.000us 0.00% 0.000us 0.000us 6
|
| 4037 |
+
cudaDeviceSynchronize 0.31% 5.290us 0.31% 5.290us 5.290us 0.000us 0.00% 0.000us 0.000us 1
|
| 4038 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4039 |
+
Self CPU time total: 1.728ms
|
| 4040 |
+
Self CUDA time total: 13.215us
|
| 4041 |
|
| 4042 |
|
| 4043 |
|
|
|
|
| 4047 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4048 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4049 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4050 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.287us 1195.72% 152.287us 152.287us 1
|
| 4051 |
+
torch_eager 6.75% 128.682us 99.76% 1.902ms 1.902ms 0.000us 0.00% 14.944us 14.944us 1
|
| 4052 |
+
aten::silu 2.22% 42.301us 89.12% 1.699ms 566.261us 6.560us 51.51% 8.768us 2.923us 3
|
| 4053 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.51% 6.560us 2.187us 3
|
| 4054 |
+
aten::mul 1.34% 25.502us 2.28% 43.392us 14.464us 6.176us 48.49% 6.176us 2.059us 3
|
| 4055 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
|
| 4056 |
+
Activity Buffer Request 74.83% 1.427ms 74.83% 1.427ms 1.427ms 2.208us 17.34% 2.208us 2.208us 1
|
| 4057 |
+
aten::slice 1.32% 25.141us 1.61% 30.781us 5.130us 0.000us 0.00% 0.000us 0.000us 6
|
| 4058 |
+
aten::as_strided 0.30% 5.640us 0.30% 5.640us 0.940us 0.000us 0.00% 0.000us 0.000us 6
|
| 4059 |
+
cudaLaunchKernel 13.00% 247.856us 13.00% 247.856us 41.309us 0.000us 0.00% 0.000us 0.000us 6
|
| 4060 |
+
cudaDeviceSynchronize 0.24% 4.611us 0.24% 4.611us 4.611us 0.000us 0.00% 0.000us 0.000us 1
|
| 4061 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4062 |
+
Self CPU time total: 1.906ms
|
| 4063 |
+
Self CUDA time total: 12.736us
|
| 4064 |
|
| 4065 |
|
| 4066 |
|
|
|
|
| 4070 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4071 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4072 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4073 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.054us 1155.39% 153.054us 153.054us 1
|
| 4074 |
+
torch_eager 6.42% 122.793us 99.75% 1.907ms 1.907ms 0.000us 0.00% 15.518us 15.518us 1
|
| 4075 |
+
aten::silu 2.19% 41.952us 89.33% 1.708ms 569.191us 6.751us 50.96% 9.022us 3.007us 3
|
| 4076 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.751us 50.96% 6.751us 2.250us 3
|
| 4077 |
+
aten::mul 1.27% 24.330us 2.36% 45.101us 15.034us 6.496us 49.04% 6.496us 2.165us 3
|
| 4078 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.496us 49.04% 6.496us 2.165us 3
|
| 4079 |
+
Activity Buffer Request 76.06% 1.454ms 76.06% 1.454ms 1.454ms 2.271us 17.14% 2.271us 2.271us 1
|
| 4080 |
+
aten::slice 1.34% 25.570us 1.64% 31.330us 5.222us 0.000us 0.00% 0.000us 0.000us 6
|
| 4081 |
+
aten::as_strided 0.30% 5.760us 0.30% 5.760us 0.960us 0.000us 0.00% 0.000us 0.000us 6
|
| 4082 |
+
cudaLaunchKernel 12.16% 232.387us 12.16% 232.387us 38.731us 0.000us 0.00% 0.000us 0.000us 6
|
| 4083 |
+
cudaDeviceSynchronize 0.25% 4.840us 0.25% 4.840us 4.840us 0.000us 0.00% 0.000us 0.000us 1
|
| 4084 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4085 |
+
Self CPU time total: 1.912ms
|
| 4086 |
+
Self CUDA time total: 13.247us
|
| 4087 |
|
| 4088 |
|
| 4089 |
|
|
|
|
| 4093 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4094 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4095 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4096 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.743us 1029.27% 159.743us 159.743us 1
|
| 4097 |
+
torch_eager 7.04% 135.613us 99.74% 1.921ms 1.921ms 0.000us 0.00% 18.208us 18.208us 1
|
| 4098 |
+
aten::silu 2.22% 42.702us 88.66% 1.708ms 569.181us 7.936us 51.13% 10.624us 3.541us 3
|
| 4099 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.13% 7.936us 2.645us 3
|
| 4100 |
+
aten::mul 1.46% 28.181us 2.39% 45.941us 15.314us 7.584us 48.87% 7.584us 2.528us 3
|
| 4101 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.87% 7.584us 2.528us 3
|
| 4102 |
+
Activity Buffer Request 75.65% 1.457ms 75.65% 1.457ms 1.457ms 2.688us 17.32% 2.688us 2.688us 1
|
| 4103 |
+
aten::slice 1.35% 26.081us 1.66% 31.951us 5.325us 0.000us 0.00% 0.000us 0.000us 6
|
| 4104 |
+
aten::as_strided 0.30% 5.870us 0.30% 5.870us 0.978us 0.000us 0.00% 0.000us 0.000us 6
|
| 4105 |
+
cudaLaunchKernel 11.71% 225.495us 11.71% 225.495us 37.582us 0.000us 0.00% 0.000us 0.000us 6
|
| 4106 |
+
cudaDeviceSynchronize 0.26% 4.960us 0.26% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1
|
| 4107 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4108 |
+
Self CPU time total: 1.926ms
|
| 4109 |
+
Self CUDA time total: 15.520us
|
| 4110 |
|
| 4111 |
|
| 4112 |
|
|
|
|
| 4116 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4117 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4118 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4119 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 156.031us 1088.46% 156.031us 156.031us 1
|
| 4120 |
+
torch_eager 6.78% 127.672us 99.74% 1.878ms 1.878ms 0.000us 0.00% 16.798us 16.798us 1
|
| 4121 |
+
aten::silu 2.24% 42.252us 88.75% 1.671ms 556.944us 7.327us 51.11% 9.790us 3.263us 3
|
| 4122 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.327us 51.11% 7.327us 2.442us 3
|
| 4123 |
+
aten::mul 1.40% 26.401us 2.46% 46.222us 15.407us 7.008us 48.89% 7.008us 2.336us 3
|
| 4124 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.89% 7.008us 2.336us 3
|
| 4125 |
+
Activity Buffer Request 75.83% 1.428ms 75.83% 1.428ms 1.428ms 2.463us 17.18% 2.463us 2.463us 1
|
| 4126 |
+
aten::slice 1.43% 26.941us 1.75% 32.941us 5.490us 0.000us 0.00% 0.000us 0.000us 6
|
| 4127 |
+
aten::as_strided 0.32% 6.000us 0.32% 6.000us 1.000us 0.000us 0.00% 0.000us 0.000us 6
|
| 4128 |
+
cudaLaunchKernel 11.73% 220.885us 11.73% 220.885us 36.814us 0.000us 0.00% 0.000us 0.000us 6
|
| 4129 |
+
cudaDeviceSynchronize 0.26% 4.871us 0.26% 4.871us 4.871us 0.000us 0.00% 0.000us 0.000us 1
|
| 4130 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4131 |
+
Self CPU time total: 1.883ms
|
| 4132 |
+
Self CUDA time total: 14.335us
|
| 4133 |
|
| 4134 |
|
| 4135 |
|
|
|
|
| 4139 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4140 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4141 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4142 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.072us 971.40% 151.072us 151.072us 1
|
| 4143 |
+
torch_eager 5.82% 108.433us 99.72% 1.859ms 1.859ms 0.000us 0.00% 18.240us 18.240us 1
|
| 4144 |
+
aten::silu 2.20% 40.971us 89.83% 1.675ms 558.344us 7.968us 51.23% 10.656us 3.552us 3
|
| 4145 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 51.23% 7.968us 2.656us 3
|
| 4146 |
+
aten::mul 1.42% 26.501us 2.46% 45.902us 15.301us 7.584us 48.77% 7.584us 2.528us 3
|
| 4147 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.77% 7.584us 2.528us 3
|
| 4148 |
+
Activity Buffer Request 76.88% 1.433ms 76.88% 1.433ms 1.433ms 2.688us 17.28% 2.688us 2.688us 1
|
| 4149 |
+
aten::slice 1.31% 24.441us 1.61% 29.960us 4.993us 0.000us 0.00% 0.000us 0.000us 6
|
| 4150 |
+
aten::as_strided 0.30% 5.519us 0.30% 5.519us 0.920us 0.000us 0.00% 0.000us 0.000us 6
|
| 4151 |
+
cudaLaunchKernel 11.80% 219.996us 11.80% 219.996us 36.666us 0.000us 0.00% 0.000us 0.000us 6
|
| 4152 |
+
cudaDeviceSynchronize 0.28% 5.300us 0.28% 5.300us 5.300us 0.000us 0.00% 0.000us 0.000us 1
|
| 4153 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4154 |
+
Self CPU time total: 1.865ms
|
| 4155 |
+
Self CUDA time total: 15.552us
|
| 4156 |
|
| 4157 |
|
| 4158 |
|
|
|
|
| 4162 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4163 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4164 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4165 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 157.150us 692.69% 157.150us 157.150us 1
|
| 4166 |
+
torch_eager 5.73% 107.203us 99.74% 1.865ms 1.865ms 0.000us 0.00% 26.622us 26.622us 1
|
| 4167 |
+
aten::silu 2.21% 41.231us 89.87% 1.680ms 560.117us 11.647us 51.34% 15.582us 5.194us 3
|
| 4168 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.647us 51.34% 11.647us 3.882us 3
|
| 4169 |
+
aten::mul 1.38% 25.882us 2.47% 46.192us 15.397us 11.040us 48.66% 11.040us 3.680us 3
|
| 4170 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.040us 48.66% 11.040us 3.680us 3
|
| 4171 |
+
Activity Buffer Request 77.17% 1.443ms 77.17% 1.443ms 1.443ms 3.935us 17.34% 3.935us 3.935us 1
|
| 4172 |
+
aten::slice 1.37% 25.600us 1.67% 31.160us 5.193us 0.000us 0.00% 0.000us 0.000us 6
|
| 4173 |
+
aten::as_strided 0.30% 5.560us 0.30% 5.560us 0.927us 0.000us 0.00% 0.000us 0.000us 6
|
| 4174 |
+
cudaLaunchKernel 11.58% 216.535us 11.58% 216.535us 36.089us 0.000us 0.00% 0.000us 0.000us 6
|
| 4175 |
+
cudaDeviceSynchronize 0.26% 4.830us 0.26% 4.830us 4.830us 0.000us 0.00% 0.000us 0.000us 1
|
| 4176 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4177 |
+
Self CPU time total: 1.870ms
|
| 4178 |
+
Self CUDA time total: 22.687us
|
| 4179 |
|
| 4180 |
|
| 4181 |
impl wl p50(ms) ok
|
|
|
|
| 4189 |
torch_eager cuda_T512_D2048 0.05 True
|
| 4190 |
torch_eager cuda_T512_D768 0.05 True
|
| 4191 |
</pre></div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4192 |
<div class="cell-artifacts">
|
| 4193 |
<h4>Artifacts:</h4>
|
| 4194 |
<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
|
activation/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
activation/results/combined_results.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
@@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3872 |
<rdf:RDF>
|
| 3873 |
<ns2:Work>
|
| 3874 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3875 |
-
<dc:date>2025-10-
|
| 3876 |
<dc:format>image/svg+xml</dc:format>
|
| 3877 |
<dc:creator>
|
| 3878 |
<ns2:Agent>
|
|
@@ -4021,96 +4029,83 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4021 |
<g id="matplotlib.axis_2">
|
| 4022 |
<g id="ytick_1">
|
| 4023 |
<g id="grid-y--2" class="grid grid-y">
|
| 4024 |
-
<path d="M 60.23
|
| 4025 |
</g>
|
| 4026 |
<g id="line2d_10">
|
| 4027 |
<defs>
|
| 4028 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4029 |
</defs>
|
| 4030 |
<g>
|
| 4031 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4032 |
</g>
|
| 4033 |
</g>
|
| 4034 |
<g id="text_10">
|
| 4035 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="ytick_2">
|
| 4039 |
<g id="grid-y--3" class="grid grid-y">
|
| 4040 |
-
<path d="M 60.23
|
| 4041 |
</g>
|
| 4042 |
<g id="line2d_11">
|
| 4043 |
<g>
|
| 4044 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4045 |
</g>
|
| 4046 |
</g>
|
| 4047 |
<g id="text_11">
|
| 4048 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="ytick_3">
|
| 4052 |
<g id="grid-y--4" class="grid grid-y">
|
| 4053 |
-
<path d="M 60.23
|
| 4054 |
</g>
|
| 4055 |
<g id="line2d_12">
|
| 4056 |
<g>
|
| 4057 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4058 |
</g>
|
| 4059 |
</g>
|
| 4060 |
<g id="text_12">
|
| 4061 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4062 |
</g>
|
| 4063 |
</g>
|
| 4064 |
<g id="ytick_4">
|
| 4065 |
<g id="grid-y--5" class="grid grid-y">
|
| 4066 |
-
<path d="M 60.23
|
| 4067 |
</g>
|
| 4068 |
<g id="line2d_13">
|
| 4069 |
<g>
|
| 4070 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4071 |
</g>
|
| 4072 |
</g>
|
| 4073 |
<g id="text_13">
|
| 4074 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="ytick_5">
|
| 4078 |
<g id="grid-y--6" class="grid grid-y">
|
| 4079 |
-
<path d="M 60.23
|
| 4080 |
</g>
|
| 4081 |
<g id="line2d_14">
|
| 4082 |
<g>
|
| 4083 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4084 |
</g>
|
| 4085 |
</g>
|
| 4086 |
<g id="text_14">
|
| 4087 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4088 |
</g>
|
| 4089 |
</g>
|
| 4090 |
<g id="ytick_6">
|
| 4091 |
<g id="grid-y--7" class="grid grid-y">
|
| 4092 |
-
<path d="M 60.23
|
| 4093 |
</g>
|
| 4094 |
<g id="line2d_15">
|
| 4095 |
<g>
|
| 4096 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4097 |
</g>
|
| 4098 |
</g>
|
| 4099 |
<g id="text_15">
|
| 4100 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4101 |
-
</g>
|
| 4102 |
-
</g>
|
| 4103 |
-
<g id="ytick_7">
|
| 4104 |
-
<g id="grid-y--8" class="grid grid-y">
|
| 4105 |
-
<path d="M 60.23 36.267161 L 847.294169 36.267161 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4106 |
-
</g>
|
| 4107 |
-
<g id="line2d_16">
|
| 4108 |
-
<g>
|
| 4109 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="36.267161" style="stroke: #000000; stroke-width: 0.8" />
|
| 4110 |
-
</g>
|
| 4111 |
-
</g>
|
| 4112 |
-
<g id="text_16">
|
| 4113 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="40.066379" transform="rotate(-0 53.23 40.066379)">0.055</text>
|
| 4114 |
</g>
|
| 4115 |
</g>
|
| 4116 |
<g id="label--y" class="ylabel">
|
|
@@ -4118,37 +4113,37 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4118 |
</g>
|
| 4119 |
</g>
|
| 4120 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4121 |
-
<path d="M 96.005644 451.16779 L 185.444754
|
| 4122 |
<defs>
|
| 4123 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4124 |
</defs>
|
| 4125 |
<g clip-path="url(#p620c7d392f)">
|
| 4126 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4127 |
-
<use ns4:href="#md7efaf3aec" x="185.444754" y="
|
| 4128 |
-
<use ns4:href="#md7efaf3aec" x="274.883864" y="
|
| 4129 |
-
<use ns4:href="#md7efaf3aec" x="364.322974" y="
|
| 4130 |
-
<use ns4:href="#md7efaf3aec" x="453.762084" y="
|
| 4131 |
-
<use ns4:href="#md7efaf3aec" x="543.201194" y="
|
| 4132 |
-
<use ns4:href="#md7efaf3aec" x="632.640304" y="
|
| 4133 |
-
<use ns4:href="#md7efaf3aec" x="722.079415" y="
|
| 4134 |
-
<use ns4:href="#md7efaf3aec" x="811.518525" y="
|
| 4135 |
</g>
|
| 4136 |
</g>
|
| 4137 |
<g id="series--torch-eager" class="series">
|
| 4138 |
-
<path d="M 96.005644
|
| 4139 |
<defs>
|
| 4140 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4141 |
</defs>
|
| 4142 |
<g clip-path="url(#p620c7d392f)">
|
| 4143 |
-
<use ns4:href="#m9b8c54d372" x="96.005644" y="
|
| 4144 |
<use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4145 |
-
<use ns4:href="#m9b8c54d372" x="274.883864" y="
|
| 4146 |
-
<use ns4:href="#m9b8c54d372" x="364.322974" y="
|
| 4147 |
-
<use ns4:href="#m9b8c54d372" x="453.762084" y="
|
| 4148 |
-
<use ns4:href="#m9b8c54d372" x="543.201194" y="
|
| 4149 |
-
<use ns4:href="#m9b8c54d372" x="632.640304" y="
|
| 4150 |
-
<use ns4:href="#m9b8c54d372" x="722.079415" y="
|
| 4151 |
-
<use ns4:href="#m9b8c54d372" x="811.518525" y="
|
| 4152 |
</g>
|
| 4153 |
</g>
|
| 4154 |
<g id="patch_3">
|
|
@@ -4163,14 +4158,14 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4163 |
<g id="patch_6">
|
| 4164 |
<path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4165 |
</g>
|
| 4166 |
-
<g id="
|
| 4167 |
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
|
| 4168 |
</g>
|
| 4169 |
<g id="legend" class="legend">
|
| 4170 |
<g id="patch_7">
|
| 4171 |
<path d="M 720.811356 64.7925 L 840.294169 64.7925 Q 842.294169 64.7925 842.294169 62.7925 L 842.294169 33.88 Q 842.294169 31.88 840.294169 31.88 L 720.811356 31.88 Q 718.811356 31.88 718.811356 33.88 L 718.811356 62.7925 Q 718.811356 64.7925 720.811356 64.7925 L 720.811356 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4172 |
</g>
|
| 4173 |
-
<g id="
|
| 4174 |
<path d="M 722.811356 39.978438 L 732.811356 39.978438 L 742.811356 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4175 |
<g>
|
| 4176 |
<use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
|
@@ -4179,7 +4174,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4179 |
<g id="legend-label--hf-kernels-swiglu" class="legend">
|
| 4180 |
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
|
| 4181 |
</g>
|
| 4182 |
-
<g id="
|
| 4183 |
<path d="M 722.811356 54.934687 L 732.811356 54.934687 L 742.811356 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4184 |
<g>
|
| 4185 |
<use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
|
@@ -4206,7 +4201,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4206 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4207 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4208 |
</span> |
|
| 4209 |
-
Cell: combine | 4.
|
| 4210 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4211 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4212 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4345,7 +4340,7 @@ Installed 37 packages in 218ms
|
|
| 4345 |
<rdf:RDF>
|
| 4346 |
<ns2:Work>
|
| 4347 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4348 |
-
<dc:date>2025-10-
|
| 4349 |
<dc:format>image/svg+xml</dc:format>
|
| 4350 |
<dc:creator>
|
| 4351 |
<ns2:Agent>
|
|
@@ -4494,96 +4489,83 @@ Installed 37 packages in 218ms
|
|
| 4494 |
<g id="matplotlib.axis_2">
|
| 4495 |
<g id="ytick_1">
|
| 4496 |
<g id="grid-y--2" class="grid grid-y">
|
| 4497 |
-
<path d="M 60.23
|
| 4498 |
</g>
|
| 4499 |
<g id="line2d_10">
|
| 4500 |
<defs>
|
| 4501 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4502 |
</defs>
|
| 4503 |
<g>
|
| 4504 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4505 |
</g>
|
| 4506 |
</g>
|
| 4507 |
<g id="text_10">
|
| 4508 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4509 |
</g>
|
| 4510 |
</g>
|
| 4511 |
<g id="ytick_2">
|
| 4512 |
<g id="grid-y--3" class="grid grid-y">
|
| 4513 |
-
<path d="M 60.23
|
| 4514 |
</g>
|
| 4515 |
<g id="line2d_11">
|
| 4516 |
<g>
|
| 4517 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4518 |
</g>
|
| 4519 |
</g>
|
| 4520 |
<g id="text_11">
|
| 4521 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4522 |
</g>
|
| 4523 |
</g>
|
| 4524 |
<g id="ytick_3">
|
| 4525 |
<g id="grid-y--4" class="grid grid-y">
|
| 4526 |
-
<path d="M 60.23
|
| 4527 |
</g>
|
| 4528 |
<g id="line2d_12">
|
| 4529 |
<g>
|
| 4530 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4531 |
</g>
|
| 4532 |
</g>
|
| 4533 |
<g id="text_12">
|
| 4534 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4535 |
</g>
|
| 4536 |
</g>
|
| 4537 |
<g id="ytick_4">
|
| 4538 |
<g id="grid-y--5" class="grid grid-y">
|
| 4539 |
-
<path d="M 60.23
|
| 4540 |
</g>
|
| 4541 |
<g id="line2d_13">
|
| 4542 |
<g>
|
| 4543 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4544 |
</g>
|
| 4545 |
</g>
|
| 4546 |
<g id="text_13">
|
| 4547 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4548 |
</g>
|
| 4549 |
</g>
|
| 4550 |
<g id="ytick_5">
|
| 4551 |
<g id="grid-y--6" class="grid grid-y">
|
| 4552 |
-
<path d="M 60.23
|
| 4553 |
</g>
|
| 4554 |
<g id="line2d_14">
|
| 4555 |
<g>
|
| 4556 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4557 |
</g>
|
| 4558 |
</g>
|
| 4559 |
<g id="text_14">
|
| 4560 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4561 |
</g>
|
| 4562 |
</g>
|
| 4563 |
<g id="ytick_6">
|
| 4564 |
<g id="grid-y--7" class="grid grid-y">
|
| 4565 |
-
<path d="M 60.23
|
| 4566 |
</g>
|
| 4567 |
<g id="line2d_15">
|
| 4568 |
<g>
|
| 4569 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4570 |
</g>
|
| 4571 |
</g>
|
| 4572 |
<g id="text_15">
|
| 4573 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4574 |
-
</g>
|
| 4575 |
-
</g>
|
| 4576 |
-
<g id="ytick_7">
|
| 4577 |
-
<g id="grid-y--8" class="grid grid-y">
|
| 4578 |
-
<path d="M 60.23 36.267161 L 847.294169 36.267161 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4579 |
-
</g>
|
| 4580 |
-
<g id="line2d_16">
|
| 4581 |
-
<g>
|
| 4582 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="36.267161" style="stroke: #000000; stroke-width: 0.8" />
|
| 4583 |
-
</g>
|
| 4584 |
-
</g>
|
| 4585 |
-
<g id="text_16">
|
| 4586 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="40.066379" transform="rotate(-0 53.23 40.066379)">0.055</text>
|
| 4587 |
</g>
|
| 4588 |
</g>
|
| 4589 |
<g id="label--y" class="ylabel">
|
|
@@ -4591,37 +4573,37 @@ Installed 37 packages in 218ms
|
|
| 4591 |
</g>
|
| 4592 |
</g>
|
| 4593 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4594 |
-
<path d="M 96.005644 451.16779 L 185.444754
|
| 4595 |
<defs>
|
| 4596 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4597 |
</defs>
|
| 4598 |
<g clip-path="url(#p620c7d392f)">
|
| 4599 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4600 |
-
<use ns4:href="#md7efaf3aec" x="185.444754" y="
|
| 4601 |
-
<use ns4:href="#md7efaf3aec" x="274.883864" y="
|
| 4602 |
-
<use ns4:href="#md7efaf3aec" x="364.322974" y="
|
| 4603 |
-
<use ns4:href="#md7efaf3aec" x="453.762084" y="
|
| 4604 |
-
<use ns4:href="#md7efaf3aec" x="543.201194" y="
|
| 4605 |
-
<use ns4:href="#md7efaf3aec" x="632.640304" y="
|
| 4606 |
-
<use ns4:href="#md7efaf3aec" x="722.079415" y="
|
| 4607 |
-
<use ns4:href="#md7efaf3aec" x="811.518525" y="
|
| 4608 |
</g>
|
| 4609 |
</g>
|
| 4610 |
<g id="series--torch-eager" class="series">
|
| 4611 |
-
<path d="M 96.005644
|
| 4612 |
<defs>
|
| 4613 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4614 |
</defs>
|
| 4615 |
<g clip-path="url(#p620c7d392f)">
|
| 4616 |
-
<use ns4:href="#m9b8c54d372" x="96.005644" y="
|
| 4617 |
<use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4618 |
-
<use ns4:href="#m9b8c54d372" x="274.883864" y="
|
| 4619 |
-
<use ns4:href="#m9b8c54d372" x="364.322974" y="
|
| 4620 |
-
<use ns4:href="#m9b8c54d372" x="453.762084" y="
|
| 4621 |
-
<use ns4:href="#m9b8c54d372" x="543.201194" y="
|
| 4622 |
-
<use ns4:href="#m9b8c54d372" x="632.640304" y="
|
| 4623 |
-
<use ns4:href="#m9b8c54d372" x="722.079415" y="
|
| 4624 |
-
<use ns4:href="#m9b8c54d372" x="811.518525" y="
|
| 4625 |
</g>
|
| 4626 |
</g>
|
| 4627 |
<g id="patch_3">
|
|
@@ -4636,14 +4618,14 @@ Installed 37 packages in 218ms
|
|
| 4636 |
<g id="patch_6">
|
| 4637 |
<path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4638 |
</g>
|
| 4639 |
-
<g id="
|
| 4640 |
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
|
| 4641 |
</g>
|
| 4642 |
<g id="legend" class="legend">
|
| 4643 |
<g id="patch_7">
|
| 4644 |
<path d="M 720.811356 64.7925 L 840.294169 64.7925 Q 842.294169 64.7925 842.294169 62.7925 L 842.294169 33.88 Q 842.294169 31.88 840.294169 31.88 L 720.811356 31.88 Q 718.811356 31.88 718.811356 33.88 L 718.811356 62.7925 Q 718.811356 64.7925 720.811356 64.7925 L 720.811356 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4645 |
</g>
|
| 4646 |
-
<g id="
|
| 4647 |
<path d="M 722.811356 39.978438 L 732.811356 39.978438 L 742.811356 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4648 |
<g>
|
| 4649 |
<use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
|
@@ -4652,7 +4634,7 @@ Installed 37 packages in 218ms
|
|
| 4652 |
<g id="legend-label--hf-kernels-swiglu" class="legend">
|
| 4653 |
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
|
| 4654 |
</g>
|
| 4655 |
-
<g id="
|
| 4656 |
<path d="M 722.811356 54.934687 L 732.811356 54.934687 L 742.811356 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4657 |
<g>
|
| 4658 |
<use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
|
|
|
| 3880 |
<rdf:RDF>
|
| 3881 |
<ns2:Work>
|
| 3882 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3883 |
+
<dc:date>2025-10-29T15:51:13.643076</dc:date>
|
| 3884 |
<dc:format>image/svg+xml</dc:format>
|
| 3885 |
<dc:creator>
|
| 3886 |
<ns2:Agent>
|
|
|
|
| 4029 |
<g id="matplotlib.axis_2">
|
| 4030 |
<g id="ytick_1">
|
| 4031 |
<g id="grid-y--2" class="grid grid-y">
|
| 4032 |
+
<path d="M 60.23 438.443756 L 847.294169 438.443756 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4033 |
</g>
|
| 4034 |
<g id="line2d_10">
|
| 4035 |
<defs>
|
| 4036 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4037 |
</defs>
|
| 4038 |
<g>
|
| 4039 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="438.443756" style="stroke: #000000; stroke-width: 0.8" />
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="text_10">
|
| 4043 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="442.242975" transform="rotate(-0 53.23 442.242975)">0.025</text>
|
| 4044 |
</g>
|
| 4045 |
</g>
|
| 4046 |
<g id="ytick_2">
|
| 4047 |
<g id="grid-y--3" class="grid grid-y">
|
| 4048 |
+
<path d="M 60.23 367.676049 L 847.294169 367.676049 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4049 |
</g>
|
| 4050 |
<g id="line2d_11">
|
| 4051 |
<g>
|
| 4052 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="367.676049" style="stroke: #000000; stroke-width: 0.8" />
|
| 4053 |
</g>
|
| 4054 |
</g>
|
| 4055 |
<g id="text_11">
|
| 4056 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="371.475268" transform="rotate(-0 53.23 371.475268)">0.030</text>
|
| 4057 |
</g>
|
| 4058 |
</g>
|
| 4059 |
<g id="ytick_3">
|
| 4060 |
<g id="grid-y--4" class="grid grid-y">
|
| 4061 |
+
<path d="M 60.23 296.908341 L 847.294169 296.908341 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4062 |
</g>
|
| 4063 |
<g id="line2d_12">
|
| 4064 |
<g>
|
| 4065 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="296.908341" style="stroke: #000000; stroke-width: 0.8" />
|
| 4066 |
</g>
|
| 4067 |
</g>
|
| 4068 |
<g id="text_12">
|
| 4069 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="300.70756" transform="rotate(-0 53.23 300.70756)">0.035</text>
|
| 4070 |
</g>
|
| 4071 |
</g>
|
| 4072 |
<g id="ytick_4">
|
| 4073 |
<g id="grid-y--5" class="grid grid-y">
|
| 4074 |
+
<path d="M 60.23 226.140634 L 847.294169 226.140634 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4075 |
</g>
|
| 4076 |
<g id="line2d_13">
|
| 4077 |
<g>
|
| 4078 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="226.140634" style="stroke: #000000; stroke-width: 0.8" />
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="text_13">
|
| 4082 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="229.939852" transform="rotate(-0 53.23 229.939852)">0.040</text>
|
| 4083 |
</g>
|
| 4084 |
</g>
|
| 4085 |
<g id="ytick_5">
|
| 4086 |
<g id="grid-y--6" class="grid grid-y">
|
| 4087 |
+
<path d="M 60.23 155.372926 L 847.294169 155.372926 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4088 |
</g>
|
| 4089 |
<g id="line2d_14">
|
| 4090 |
<g>
|
| 4091 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="155.372926" style="stroke: #000000; stroke-width: 0.8" />
|
| 4092 |
</g>
|
| 4093 |
</g>
|
| 4094 |
<g id="text_14">
|
| 4095 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="159.172145" transform="rotate(-0 53.23 159.172145)">0.045</text>
|
| 4096 |
</g>
|
| 4097 |
</g>
|
| 4098 |
<g id="ytick_6">
|
| 4099 |
<g id="grid-y--7" class="grid grid-y">
|
| 4100 |
+
<path d="M 60.23 84.605219 L 847.294169 84.605219 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4101 |
</g>
|
| 4102 |
<g id="line2d_15">
|
| 4103 |
<g>
|
| 4104 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="84.605219" style="stroke: #000000; stroke-width: 0.8" />
|
| 4105 |
</g>
|
| 4106 |
</g>
|
| 4107 |
<g id="text_15">
|
| 4108 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="88.404437" transform="rotate(-0 53.23 88.404437)">0.050</text>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4109 |
</g>
|
| 4110 |
</g>
|
| 4111 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4113 |
</g>
|
| 4114 |
</g>
|
| 4115 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4116 |
+
<path d="M 96.005644 451.16779 L 185.444754 364.40658 L 274.883864 374.045142 L 364.322974 392.869353 L 453.762084 389.882956 L 543.201194 397.667403 L 632.640304 381.532366 L 722.079415 398.106163 L 811.518525 419.478011 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4117 |
<defs>
|
| 4118 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4119 |
</defs>
|
| 4120 |
<g clip-path="url(#p620c7d392f)">
|
| 4121 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4122 |
+
<use ns4:href="#md7efaf3aec" x="185.444754" y="364.40658" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4123 |
+
<use ns4:href="#md7efaf3aec" x="274.883864" y="374.045142" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4124 |
+
<use ns4:href="#md7efaf3aec" x="364.322974" y="392.869353" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4125 |
+
<use ns4:href="#md7efaf3aec" x="453.762084" y="389.882956" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4126 |
+
<use ns4:href="#md7efaf3aec" x="543.201194" y="397.667403" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4127 |
+
<use ns4:href="#md7efaf3aec" x="632.640304" y="381.532366" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4128 |
+
<use ns4:href="#md7efaf3aec" x="722.079415" y="398.106163" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4129 |
+
<use ns4:href="#md7efaf3aec" x="811.518525" y="419.478011" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4130 |
</g>
|
| 4131 |
</g>
|
| 4132 |
<g id="series--torch-eager" class="series">
|
| 4133 |
+
<path d="M 96.005644 202.773137 L 185.444754 47.08418 L 274.883864 70.154453 L 364.322974 81.180062 L 453.762084 86.714096 L 543.201194 90.252482 L 632.640304 80.486538 L 722.079415 88.412521 L 811.518525 96.338505 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4134 |
<defs>
|
| 4135 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4136 |
</defs>
|
| 4137 |
<g clip-path="url(#p620c7d392f)">
|
| 4138 |
+
<use ns4:href="#m9b8c54d372" x="96.005644" y="202.773137" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4139 |
<use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4140 |
+
<use ns4:href="#m9b8c54d372" x="274.883864" y="70.154453" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4141 |
+
<use ns4:href="#m9b8c54d372" x="364.322974" y="81.180062" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4142 |
+
<use ns4:href="#m9b8c54d372" x="453.762084" y="86.714096" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4143 |
+
<use ns4:href="#m9b8c54d372" x="543.201194" y="90.252482" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4144 |
+
<use ns4:href="#m9b8c54d372" x="632.640304" y="80.486538" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4145 |
+
<use ns4:href="#m9b8c54d372" x="722.079415" y="88.412521" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4146 |
+
<use ns4:href="#m9b8c54d372" x="811.518525" y="96.338505" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4147 |
</g>
|
| 4148 |
</g>
|
| 4149 |
<g id="patch_3">
|
|
|
|
| 4158 |
<g id="patch_6">
|
| 4159 |
<path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4160 |
</g>
|
| 4161 |
+
<g id="text_16">
|
| 4162 |
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
|
| 4163 |
</g>
|
| 4164 |
<g id="legend" class="legend">
|
| 4165 |
<g id="patch_7">
|
| 4166 |
<path d="M 720.811356 64.7925 L 840.294169 64.7925 Q 842.294169 64.7925 842.294169 62.7925 L 842.294169 33.88 Q 842.294169 31.88 840.294169 31.88 L 720.811356 31.88 Q 718.811356 31.88 718.811356 33.88 L 718.811356 62.7925 Q 718.811356 64.7925 720.811356 64.7925 L 720.811356 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4167 |
</g>
|
| 4168 |
+
<g id="line2d_16">
|
| 4169 |
<path d="M 722.811356 39.978438 L 732.811356 39.978438 L 742.811356 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4170 |
<g>
|
| 4171 |
<use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
|
|
|
| 4174 |
<g id="legend-label--hf-kernels-swiglu" class="legend">
|
| 4175 |
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
|
| 4176 |
</g>
|
| 4177 |
+
<g id="line2d_17">
|
| 4178 |
<path d="M 722.811356 54.934687 L 732.811356 54.934687 L 742.811356 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4179 |
<g>
|
| 4180 |
<use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
|
|
|
| 4201 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4202 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4203 |
</span> |
|
| 4204 |
+
Cell: combine | 4.26s
|
| 4205 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4206 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4207 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4340 |
<rdf:RDF>
|
| 4341 |
<ns2:Work>
|
| 4342 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4343 |
+
<dc:date>2025-10-29T15:51:13.643076</dc:date>
|
| 4344 |
<dc:format>image/svg+xml</dc:format>
|
| 4345 |
<dc:creator>
|
| 4346 |
<ns2:Agent>
|
|
|
|
| 4489 |
<g id="matplotlib.axis_2">
|
| 4490 |
<g id="ytick_1">
|
| 4491 |
<g id="grid-y--2" class="grid grid-y">
|
| 4492 |
+
<path d="M 60.23 438.443756 L 847.294169 438.443756 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4493 |
</g>
|
| 4494 |
<g id="line2d_10">
|
| 4495 |
<defs>
|
| 4496 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4497 |
</defs>
|
| 4498 |
<g>
|
| 4499 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="438.443756" style="stroke: #000000; stroke-width: 0.8" />
|
| 4500 |
</g>
|
| 4501 |
</g>
|
| 4502 |
<g id="text_10">
|
| 4503 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="442.242975" transform="rotate(-0 53.23 442.242975)">0.025</text>
|
| 4504 |
</g>
|
| 4505 |
</g>
|
| 4506 |
<g id="ytick_2">
|
| 4507 |
<g id="grid-y--3" class="grid grid-y">
|
| 4508 |
+
<path d="M 60.23 367.676049 L 847.294169 367.676049 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4509 |
</g>
|
| 4510 |
<g id="line2d_11">
|
| 4511 |
<g>
|
| 4512 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="367.676049" style="stroke: #000000; stroke-width: 0.8" />
|
| 4513 |
</g>
|
| 4514 |
</g>
|
| 4515 |
<g id="text_11">
|
| 4516 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="371.475268" transform="rotate(-0 53.23 371.475268)">0.030</text>
|
| 4517 |
</g>
|
| 4518 |
</g>
|
| 4519 |
<g id="ytick_3">
|
| 4520 |
<g id="grid-y--4" class="grid grid-y">
|
| 4521 |
+
<path d="M 60.23 296.908341 L 847.294169 296.908341 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4522 |
</g>
|
| 4523 |
<g id="line2d_12">
|
| 4524 |
<g>
|
| 4525 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="296.908341" style="stroke: #000000; stroke-width: 0.8" />
|
| 4526 |
</g>
|
| 4527 |
</g>
|
| 4528 |
<g id="text_12">
|
| 4529 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="300.70756" transform="rotate(-0 53.23 300.70756)">0.035</text>
|
| 4530 |
</g>
|
| 4531 |
</g>
|
| 4532 |
<g id="ytick_4">
|
| 4533 |
<g id="grid-y--5" class="grid grid-y">
|
| 4534 |
+
<path d="M 60.23 226.140634 L 847.294169 226.140634 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4535 |
</g>
|
| 4536 |
<g id="line2d_13">
|
| 4537 |
<g>
|
| 4538 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="226.140634" style="stroke: #000000; stroke-width: 0.8" />
|
| 4539 |
</g>
|
| 4540 |
</g>
|
| 4541 |
<g id="text_13">
|
| 4542 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="229.939852" transform="rotate(-0 53.23 229.939852)">0.040</text>
|
| 4543 |
</g>
|
| 4544 |
</g>
|
| 4545 |
<g id="ytick_5">
|
| 4546 |
<g id="grid-y--6" class="grid grid-y">
|
| 4547 |
+
<path d="M 60.23 155.372926 L 847.294169 155.372926 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4548 |
</g>
|
| 4549 |
<g id="line2d_14">
|
| 4550 |
<g>
|
| 4551 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="155.372926" style="stroke: #000000; stroke-width: 0.8" />
|
| 4552 |
</g>
|
| 4553 |
</g>
|
| 4554 |
<g id="text_14">
|
| 4555 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="159.172145" transform="rotate(-0 53.23 159.172145)">0.045</text>
|
| 4556 |
</g>
|
| 4557 |
</g>
|
| 4558 |
<g id="ytick_6">
|
| 4559 |
<g id="grid-y--7" class="grid grid-y">
|
| 4560 |
+
<path d="M 60.23 84.605219 L 847.294169 84.605219 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4561 |
</g>
|
| 4562 |
<g id="line2d_15">
|
| 4563 |
<g>
|
| 4564 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="84.605219" style="stroke: #000000; stroke-width: 0.8" />
|
| 4565 |
</g>
|
| 4566 |
</g>
|
| 4567 |
<g id="text_15">
|
| 4568 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="88.404437" transform="rotate(-0 53.23 88.404437)">0.050</text>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4569 |
</g>
|
| 4570 |
</g>
|
| 4571 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4573 |
</g>
|
| 4574 |
</g>
|
| 4575 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4576 |
+
<path d="M 96.005644 451.16779 L 185.444754 364.40658 L 274.883864 374.045142 L 364.322974 392.869353 L 453.762084 389.882956 L 543.201194 397.667403 L 632.640304 381.532366 L 722.079415 398.106163 L 811.518525 419.478011 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4577 |
<defs>
|
| 4578 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4579 |
</defs>
|
| 4580 |
<g clip-path="url(#p620c7d392f)">
|
| 4581 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4582 |
+
<use ns4:href="#md7efaf3aec" x="185.444754" y="364.40658" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4583 |
+
<use ns4:href="#md7efaf3aec" x="274.883864" y="374.045142" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4584 |
+
<use ns4:href="#md7efaf3aec" x="364.322974" y="392.869353" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4585 |
+
<use ns4:href="#md7efaf3aec" x="453.762084" y="389.882956" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4586 |
+
<use ns4:href="#md7efaf3aec" x="543.201194" y="397.667403" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4587 |
+
<use ns4:href="#md7efaf3aec" x="632.640304" y="381.532366" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4588 |
+
<use ns4:href="#md7efaf3aec" x="722.079415" y="398.106163" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4589 |
+
<use ns4:href="#md7efaf3aec" x="811.518525" y="419.478011" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4590 |
</g>
|
| 4591 |
</g>
|
| 4592 |
<g id="series--torch-eager" class="series">
|
| 4593 |
+
<path d="M 96.005644 202.773137 L 185.444754 47.08418 L 274.883864 70.154453 L 364.322974 81.180062 L 453.762084 86.714096 L 543.201194 90.252482 L 632.640304 80.486538 L 722.079415 88.412521 L 811.518525 96.338505 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4594 |
<defs>
|
| 4595 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4596 |
</defs>
|
| 4597 |
<g clip-path="url(#p620c7d392f)">
|
| 4598 |
+
<use ns4:href="#m9b8c54d372" x="96.005644" y="202.773137" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4599 |
<use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4600 |
+
<use ns4:href="#m9b8c54d372" x="274.883864" y="70.154453" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4601 |
+
<use ns4:href="#m9b8c54d372" x="364.322974" y="81.180062" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4602 |
+
<use ns4:href="#m9b8c54d372" x="453.762084" y="86.714096" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4603 |
+
<use ns4:href="#m9b8c54d372" x="543.201194" y="90.252482" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4604 |
+
<use ns4:href="#m9b8c54d372" x="632.640304" y="80.486538" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4605 |
+
<use ns4:href="#m9b8c54d372" x="722.079415" y="88.412521" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4606 |
+
<use ns4:href="#m9b8c54d372" x="811.518525" y="96.338505" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4607 |
</g>
|
| 4608 |
</g>
|
| 4609 |
<g id="patch_3">
|
|
|
|
| 4618 |
<g id="patch_6">
|
| 4619 |
<path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4620 |
</g>
|
| 4621 |
+
<g id="text_16">
|
| 4622 |
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
|
| 4623 |
</g>
|
| 4624 |
<g id="legend" class="legend">
|
| 4625 |
<g id="patch_7">
|
| 4626 |
<path d="M 720.811356 64.7925 L 840.294169 64.7925 Q 842.294169 64.7925 842.294169 62.7925 L 842.294169 33.88 Q 842.294169 31.88 840.294169 31.88 L 720.811356 31.88 Q 718.811356 31.88 718.811356 33.88 L 718.811356 62.7925 Q 718.811356 64.7925 720.811356 64.7925 L 720.811356 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4627 |
</g>
|
| 4628 |
+
<g id="line2d_16">
|
| 4629 |
<path d="M 722.811356 39.978438 L 732.811356 39.978438 L 742.811356 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4630 |
<g>
|
| 4631 |
<use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
|
|
|
| 4634 |
<g id="legend-label--hf-kernels-swiglu" class="legend">
|
| 4635 |
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
|
| 4636 |
</g>
|
| 4637 |
+
<g id="line2d_17">
|
| 4638 |
<path d="M 722.811356 54.934687 L 732.811356 54.934687 L 742.811356 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4639 |
<g>
|
| 4640 |
<use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
CHANGED
|
@@ -1,24 +1,24 @@
|
|
| 1 |
-
{"ts": "2025-10-
|
| 2 |
-
{"ts": "2025-10-
|
| 3 |
-
{"ts": "2025-10-
|
| 4 |
-
{"ts": "2025-10-
|
| 5 |
-
{"ts": "2025-10-
|
| 6 |
-
{"ts": "2025-10-
|
| 7 |
-
{"ts": "2025-10-
|
| 8 |
-
{"ts": "2025-10-
|
| 9 |
-
{"ts": "2025-10-
|
| 10 |
-
{"ts": "2025-10-
|
| 11 |
-
{"ts": "2025-10-
|
| 12 |
-
{"ts": "2025-10-
|
| 13 |
-
{"ts": "2025-10-
|
| 14 |
-
{"ts": "2025-10-
|
| 15 |
-
{"ts": "2025-10-
|
| 16 |
-
{"ts": "2025-10-
|
| 17 |
-
{"ts": "2025-10-
|
| 18 |
-
{"ts": "2025-10-
|
| 19 |
-
{"ts": "2025-10-
|
| 20 |
-
{"ts": "2025-10-
|
| 21 |
-
{"ts": "2025-10-
|
| 22 |
-
{"ts": "2025-10-
|
| 23 |
-
{"ts": "2025-10-
|
| 24 |
-
{"ts": "2025-10-
|
|
|
|
| 1 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07023200004141472, "p50": 0.07095199998730095, "p90": 0.07123199998204655, "mean": 0.07353400000056354, "iqr": 0.0008999999749903509, "raw_times": [0.07095199998730095, 0.08492199998499927, 0.0703320000070562, 0.07123199998204655, 0.07023200004141472], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07603100004871521, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08460200001536577, "p50": 0.08611200001951147, "p90": 0.08698200002754675, "mean": 0.08602200001632809, "iqr": 0.001740000016070553, "raw_times": [0.08460200001536577, 0.08611200001951147, 0.08698200002754675, 0.08717200000774028, 0.0852420000114762], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08820200002901402, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08334199998216718, "p50": 0.08516200000485696, "p90": 0.08565199999566175, "mean": 0.08509399999638845, "iqr": 0.0015599999869664316, "raw_times": [0.08334199998216718, 0.08722199999056102, 0.08565199999566175, 0.08516200000485696, 0.08409200000869532], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0867219999918234, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08196199996746145, "p50": 0.08375099997692814, "p90": 0.08384200003774822, "mean": 0.08337179999671207, "iqr": 0.0010800000609378912, "raw_times": [0.08276199997681033, 0.08454200002461221, 0.08384200003774822, 0.08375099997692814, 0.08196199996746145], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08716199999980745, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08276199997681033, "p50": 0.08335200004694343, "p90": 0.08474200001273857, "mean": 0.08374400000548121, "iqr": 0.0019199999883312557, "raw_times": [0.08335200004694343, 0.08474200001273857, 0.08504199996650641, 0.08276199997681033, 0.08282200002440732], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08652200000369703, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08203199996614785, "p50": 0.08333200003107777, "p90": 0.08342199998878641, "mean": 0.08316619998822716, "iqr": 0.0006700000199089118, "raw_times": [0.08203199996614785, 0.08333200003107777, 0.08342199998878641, 0.0827519999688775, 0.08429299998624629], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08916199999475793, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 7 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08283200003234015, "p50": 0.08409299999811992, "p90": 0.08469200002991784, "mean": 0.08781020001151774, "iqr": 0.001050000037139398, "raw_times": [0.08469200002991784, 0.08409299999811992, 0.10379200000443234, 0.08364199999277844, 0.08283200003234015], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08838200000127472, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 8 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08139199997003743, "p50": 0.08336199999803284, "p90": 0.08399199998621043, "mean": 0.0832759999980226, "iqr": 0.0010599999313853914, "raw_times": [0.08139199997003743, 0.08470199998100725, 0.08293200005482504, 0.08336199999803284, 0.08399199998621043], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08715199999187462, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 9 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08235199999262477, "p50": 0.08327199998348078, "p90": 0.0835210000218467, "mean": 0.08336580000332106, "iqr": 0.00033899999607456266, "raw_times": [0.08235199999262477, 0.0835210000218467, 0.08450199999288088, 0.08327199998348078, 0.08318200002577214], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08735199998000098, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 10 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08233200003360253, "p50": 0.08335199999010001, "p90": 0.08342199998878641, "mean": 0.08314600000858263, "iqr": 0.0004799999828719592, "raw_times": [0.08342199998878641, 0.08233200003360253, 0.08335199999010001, 0.08294200000591445, 0.08368200002450976], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08666200000106983, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 11 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.1449639999577812, "p50": 0.14544300000807198, "p90": 0.14571399998430934, "mean": 0.14548759999115646, "iqr": 0.00032100001590151805, "raw_times": [0.14544300000807198, 0.1449639999577812, 0.14539299996840782, 0.14571399998430934, 0.14592400003721195], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.14803300001631214, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 12 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16181400002324153, "p50": 0.1630739999995967, "p90": 0.16360400002213282, "mean": 0.16567200000281446, "iqr": 0.0017800000478018774, "raw_times": [0.16181400002324153, 0.17804399999477027, 0.1630739999995967, 0.16182399997433095, 0.16360400002213282], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.16251400001010552, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 13 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08132199997135103, "p50": 0.08263099999794576, "p90": 0.08295200001384728, "mean": 0.0824317999899904, "iqr": 0.0009100000397666008, "raw_times": [0.08132199997135103, 0.08321199999272721, 0.08204199997408068, 0.08295200001384728, 0.08263099999794576], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08486200005108913, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 14 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08171299998593895, "p50": 0.08253199996488547, "p90": 0.08321199999272721, "mean": 0.08254819997546292, "iqr": 0.001280000049064256, "raw_times": [0.08193199994366296, 0.08335199999010001, 0.08321199999272721, 0.08171299998593895, 0.08253199996488547], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08592199998247452, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 15 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08148200004143291, "p50": 0.08176199997933509, "p90": 0.08353199996236071, "mean": 0.08242180000479493, "iqr": 0.002030999951330159, "raw_times": [0.08150100001103056, 0.08148200004143291, 0.08383200002981539, 0.08176199997933509, 0.08353199996236071], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08680199999844262, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 16 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08221299998467657, "p50": 0.08294200000591445, "p90": 0.08321200004957063, "mean": 0.08299800000486357, "iqr": 0.0007910000476840651, "raw_times": [0.08242100000188657, 0.08420199998226963, 0.08321200004957063, 0.08221299998467657, 0.08294200000591445], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08663200003411475, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 17 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08301200000460085, "p50": 0.08371199999146484, "p90": 0.08385299997826223, "mean": 0.08369219999622146, "iqr": 0.0001610000026630587, "raw_times": [0.08301200000460085, 0.08371199999146484, 0.08419200003118021, 0.08385299997826223, 0.08369199997559917], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.086651999993137, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 18 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08138200001894802, "p50": 0.08318200002577214, "p90": 0.08328199999141361, "mean": 0.08309020000751843, "iqr": 0.0010899999551838846, "raw_times": [0.08219200003622973, 0.08541299996522866, 0.08318200002577214, 0.08138200001894802, 0.08328199999141361], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08645299999443523, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 19 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0822520000269833, "p50": 0.08321100000330262, "p90": 0.08357199999409204, "mean": 0.08451599999261816, "iqr": 0.0009600000225873373, "raw_times": [0.09093299996720816, 0.0822520000269833, 0.08321100000330262, 0.0826119999715047, 0.08357199999409204], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08730200005402367, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 20 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08279200000060882, "p50": 0.08370200004037542, "p90": 0.08400199999414326, "mean": 0.08373800000072151, "iqr": 0.0006500000040432496, "raw_times": [0.08335199999010001, 0.08400199999414326, 0.08484199997838004, 0.08279200000060882, 0.08370200004037542], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08856199997353542, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 21 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09200200003078862, "p50": 0.09372200003099351, "p90": 0.09380200003761274, "mean": 0.09347200001457168, "iqr": 0.00012000003835055395, "raw_times": [0.09200200003078862, 0.09415199997420132, 0.09380200003761274, 0.09372200003099351, 0.09368199999926219], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09485200001790872, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 22 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.098961999981384, "p50": 0.10011200004100829, "p90": 0.10014200000796336, "mean": 0.10138220001181253, "iqr": 0.0004400000079840538, "raw_times": [0.09970199999997931, 0.098961999981384, 0.10011200004100829, 0.10014200000796336, 0.10799300002872769], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.11010200000782788, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 23 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.48627099999976053, "p50": 0.48646099997995407, "p90": 0.4873609999549444, "mean": 0.48691319999534244, "iqr": 0.00103899992609513, "raw_times": [0.48627099999976053, 0.4873609999549444, 0.4881510000132039, 0.4863220000288493, 0.48646099997995407], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.48353100004305816, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 24 |
+
{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.491851000049337, "p50": 0.49710199999708493, "p90": 0.49729199997727846, "mean": 0.49653980000812226, "iqr": 0.0012099999935344385, "raw_times": [0.496081999983744, 0.5003720000331668, 0.49729199997727846, 0.49710199999708493, 0.491851000049337], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5018319999976484, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
causal_conv1d/impls/cells/benchmark.py
CHANGED
|
@@ -4,28 +4,37 @@
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
-
# "kernels",
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
| 11 |
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
|
| 12 |
# ///
|
| 13 |
import torch
|
|
|
|
| 14 |
import sys
|
| 15 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
-
from kernels import get_kernel
|
| 17 |
|
| 18 |
-
# Load the causal conv1d kernel
|
| 19 |
-
causal_conv1d = get_kernel("kernels-community/causal-conv1d")
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
run_benchmark(
|
| 27 |
kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
|
| 28 |
-
impl_name="
|
| 29 |
-
impl_tags={"family": "
|
| 30 |
-
impl_func=
|
| 31 |
)
|
|
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
|
|
|
| 7 |
# ]
|
| 8 |
#
|
| 9 |
# [tool.uv.sources]
|
| 10 |
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
|
| 11 |
# ///
|
| 12 |
import torch
|
| 13 |
+
import torch.nn.functional as F
|
| 14 |
import sys
|
| 15 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
|
|
|
| 16 |
|
|
|
|
|
|
|
| 17 |
|
| 18 |
+
def torch_causal_conv1d(input_tensor, weight, bias):
|
| 19 |
+
# Convert to weight dtype for computation
|
| 20 |
+
x = input_tensor.to(weight.dtype)
|
| 21 |
+
dim = weight.shape[0]
|
| 22 |
+
width = weight.shape[1]
|
| 23 |
+
seqlen = input_tensor.shape[-1]
|
| 24 |
|
| 25 |
+
# Depthwise causal conv1d using PyTorch
|
| 26 |
+
out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
|
| 27 |
+
|
| 28 |
+
# Truncate to original sequence length
|
| 29 |
+
out = out[..., :seqlen]
|
| 30 |
+
|
| 31 |
+
# Convert back to original dtype
|
| 32 |
+
return out.to(input_tensor.dtype)
|
| 33 |
|
| 34 |
|
| 35 |
run_benchmark(
|
| 36 |
kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
|
| 37 |
+
impl_name="torch_eager",
|
| 38 |
+
impl_tags={"family": "pytorch", "backend": "eager"},
|
| 39 |
+
impl_func=torch_causal_conv1d,
|
| 40 |
)
|
causal_conv1d/impls/hf_kernels_causal_conv1d.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
causal_conv1d/impls/torch_causal_conv1d.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
causal_conv1d/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
causal_conv1d/results/combined_results.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
@@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3872 |
<rdf:RDF>
|
| 3873 |
<ns2:Work>
|
| 3874 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3875 |
-
<dc:date>2025-10-
|
| 3876 |
<dc:format>image/svg+xml</dc:format>
|
| 3877 |
<dc:creator>
|
| 3878 |
<ns2:Agent>
|
|
@@ -4216,70 +4224,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4216 |
<g id="matplotlib.axis_2">
|
| 4217 |
<g id="ytick_1">
|
| 4218 |
<g id="grid-y--2" class="grid grid-y">
|
| 4219 |
-
<path d="M 47.72
|
| 4220 |
</g>
|
| 4221 |
<g id="line2d_25">
|
| 4222 |
<defs>
|
| 4223 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4224 |
</defs>
|
| 4225 |
<g>
|
| 4226 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4227 |
</g>
|
| 4228 |
</g>
|
| 4229 |
<g id="text_25">
|
| 4230 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4231 |
</g>
|
| 4232 |
</g>
|
| 4233 |
<g id="ytick_2">
|
| 4234 |
<g id="grid-y--3" class="grid grid-y">
|
| 4235 |
-
<path d="M 47.72
|
| 4236 |
</g>
|
| 4237 |
<g id="line2d_26">
|
| 4238 |
<g>
|
| 4239 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4240 |
</g>
|
| 4241 |
</g>
|
| 4242 |
<g id="text_26">
|
| 4243 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4244 |
</g>
|
| 4245 |
</g>
|
| 4246 |
<g id="ytick_3">
|
| 4247 |
<g id="grid-y--4" class="grid grid-y">
|
| 4248 |
-
<path d="M 47.72
|
| 4249 |
</g>
|
| 4250 |
<g id="line2d_27">
|
| 4251 |
<g>
|
| 4252 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4253 |
</g>
|
| 4254 |
</g>
|
| 4255 |
<g id="text_27">
|
| 4256 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4257 |
</g>
|
| 4258 |
</g>
|
| 4259 |
<g id="ytick_4">
|
| 4260 |
<g id="grid-y--5" class="grid grid-y">
|
| 4261 |
-
<path d="M 47.72
|
| 4262 |
</g>
|
| 4263 |
<g id="line2d_28">
|
| 4264 |
<g>
|
| 4265 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4266 |
</g>
|
| 4267 |
</g>
|
| 4268 |
<g id="text_28">
|
| 4269 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4270 |
</g>
|
| 4271 |
</g>
|
| 4272 |
<g id="ytick_5">
|
| 4273 |
<g id="grid-y--6" class="grid grid-y">
|
| 4274 |
-
<path d="M 47.72
|
| 4275 |
</g>
|
| 4276 |
<g id="line2d_29">
|
| 4277 |
<g>
|
| 4278 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4279 |
</g>
|
| 4280 |
</g>
|
| 4281 |
<g id="text_29">
|
| 4282 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4283 |
</g>
|
| 4284 |
</g>
|
| 4285 |
<g id="label--y" class="ylabel">
|
|
@@ -4287,66 +4295,66 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4287 |
</g>
|
| 4288 |
</g>
|
| 4289 |
<g id="series--hf-kernels-causal-conv1d" class="series">
|
| 4290 |
-
<path d="M 83.325193 420.186871 L 114.286231 413.
|
| 4291 |
<defs>
|
| 4292 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4293 |
</defs>
|
| 4294 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 4295 |
<use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4296 |
-
<use ns4:href="#md7efaf3aec" x="114.286231" y="413.
|
| 4297 |
-
<use ns4:href="#md7efaf3aec" x="145.247268" y="
|
| 4298 |
-
<use ns4:href="#md7efaf3aec" x="176.208306" y="414.
|
| 4299 |
-
<use ns4:href="#md7efaf3aec" x="207.169343" y="414.
|
| 4300 |
-
<use ns4:href="#md7efaf3aec" x="238.130381" y="
|
| 4301 |
-
<use ns4:href="#md7efaf3aec" x="269.091418" y="
|
| 4302 |
-
<use ns4:href="#md7efaf3aec" x="300.052455" y="415.
|
| 4303 |
-
<use ns4:href="#md7efaf3aec" x="331.013493" y="416.
|
| 4304 |
-
<use ns4:href="#md7efaf3aec" x="361.97453" y="415.
|
| 4305 |
-
<use ns4:href="#md7efaf3aec" x="392.935568" y="
|
| 4306 |
-
<use ns4:href="#md7efaf3aec" x="423.896605" y="416.
|
| 4307 |
-
<use ns4:href="#md7efaf3aec" x="454.857643" y="415.
|
| 4308 |
-
<use ns4:href="#md7efaf3aec" x="485.81868" y="
|
| 4309 |
-
<use ns4:href="#md7efaf3aec" x="516.779718" y="
|
| 4310 |
-
<use ns4:href="#md7efaf3aec" x="547.740755" y="415.
|
| 4311 |
-
<use ns4:href="#md7efaf3aec" x="578.701793" y="415.
|
| 4312 |
-
<use ns4:href="#md7efaf3aec" x="609.66283" y="
|
| 4313 |
-
<use ns4:href="#md7efaf3aec" x="640.623868" y="414.
|
| 4314 |
-
<use ns4:href="#md7efaf3aec" x="671.584905" y="
|
| 4315 |
-
<use ns4:href="#md7efaf3aec" x="702.545943" y="414.
|
| 4316 |
-
<use ns4:href="#md7efaf3aec" x="733.50698" y="414.
|
| 4317 |
-
<use ns4:href="#md7efaf3aec" x="764.468018" y="
|
| 4318 |
-
<use ns4:href="#md7efaf3aec" x="795.429055" y="
|
| 4319 |
</g>
|
| 4320 |
</g>
|
| 4321 |
<g id="series--torch-eager" class="series">
|
| 4322 |
-
<path d="M 83.325193
|
| 4323 |
<defs>
|
| 4324 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4325 |
</defs>
|
| 4326 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 4327 |
-
<use ns4:href="#m9b8c54d372" x="83.325193" y="
|
| 4328 |
-
<use ns4:href="#m9b8c54d372" x="114.286231" y="
|
| 4329 |
-
<use ns4:href="#m9b8c54d372" x="145.247268" y="
|
| 4330 |
-
<use ns4:href="#m9b8c54d372" x="176.208306" y="
|
| 4331 |
-
<use ns4:href="#m9b8c54d372" x="207.169343" y="
|
| 4332 |
-
<use ns4:href="#m9b8c54d372" x="238.130381" y="
|
| 4333 |
-
<use ns4:href="#m9b8c54d372" x="269.091418" y="
|
| 4334 |
-
<use ns4:href="#m9b8c54d372" x="300.052455" y="
|
| 4335 |
-
<use ns4:href="#m9b8c54d372" x="331.013493" y="
|
| 4336 |
-
<use ns4:href="#m9b8c54d372" x="361.97453" y="
|
| 4337 |
-
<use ns4:href="#m9b8c54d372" x="392.935568" y="
|
| 4338 |
-
<use ns4:href="#m9b8c54d372" x="423.896605" y="
|
| 4339 |
-
<use ns4:href="#m9b8c54d372" x="454.857643" y="
|
| 4340 |
-
<use ns4:href="#m9b8c54d372" x="485.81868" y="
|
| 4341 |
-
<use ns4:href="#m9b8c54d372" x="516.779718" y="
|
| 4342 |
-
<use ns4:href="#m9b8c54d372" x="547.740755" y="
|
| 4343 |
-
<use ns4:href="#m9b8c54d372" x="578.701793" y="
|
| 4344 |
-
<use ns4:href="#m9b8c54d372" x="609.66283" y="
|
| 4345 |
-
<use ns4:href="#m9b8c54d372" x="640.623868" y="
|
| 4346 |
-
<use ns4:href="#m9b8c54d372" x="671.584905" y="
|
| 4347 |
-
<use ns4:href="#m9b8c54d372" x="702.545943" y="
|
| 4348 |
-
<use ns4:href="#m9b8c54d372" x="733.50698" y="
|
| 4349 |
-
<use ns4:href="#m9b8c54d372" x="764.468018" y="
|
| 4350 |
<use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4351 |
</g>
|
| 4352 |
</g>
|
|
@@ -4405,7 +4413,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4405 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4406 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4407 |
</span> |
|
| 4408 |
-
Cell: combine | 4.
|
| 4409 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4410 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4411 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4498,12 +4506,12 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W2 0.05 True
|
|
| 4498 |
hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True
|
| 4499 |
hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True
|
| 4500 |
hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True
|
| 4501 |
-
hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.
|
| 4502 |
-
hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.
|
| 4503 |
-
hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.
|
| 4504 |
hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True
|
| 4505 |
-
hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.
|
| 4506 |
-
hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.
|
| 4507 |
hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2 0.05 True
|
| 4508 |
hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4 0.05 True
|
| 4509 |
hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2 0.05 True
|
|
@@ -4514,7 +4522,7 @@ hf_kernels_causal_conv1d cuda_B4_D64_S128_W2 0.05 True
|
|
| 4514 |
hf_kernels_causal_conv1d cuda_B4_D64_S128_W4 0.05 True
|
| 4515 |
hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2 0.05 True
|
| 4516 |
hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4 0.05 True
|
| 4517 |
-
hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.
|
| 4518 |
hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
|
| 4519 |
torch_eager cuda_B2_D2048_S128_W2 0.08 True
|
| 4520 |
torch_eager cuda_B2_D2048_S128_W4 0.08 True
|
|
@@ -4537,7 +4545,7 @@ torch_eager cuda_B4_D2048_S512_W4 0.10 True
|
|
| 4537 |
torch_eager cuda_B4_D64_S128_W2 0.08 True
|
| 4538 |
torch_eager cuda_B4_D64_S128_W4 0.08 True
|
| 4539 |
torch_eager cuda_B4_D64_S2048_W2 0.08 True
|
| 4540 |
-
torch_eager cuda_B4_D64_S2048_W4 0.
|
| 4541 |
torch_eager cuda_B4_D64_S512_W2 0.08 True
|
| 4542 |
torch_eager cuda_B4_D64_S512_W4 0.08 True
|
| 4543 |
|
|
@@ -4559,7 +4567,7 @@ Implementations included:
|
|
| 4559 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4560 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4561 |
<div class="uv-logs-content" style="display: none;">
|
| 4562 |
-
Installed 37 packages in
|
| 4563 |
</div>
|
| 4564 |
</div>
|
| 4565 |
<div class="cell-artifacts">
|
|
@@ -4572,7 +4580,7 @@ Installed 37 packages in 214ms
|
|
| 4572 |
<rdf:RDF>
|
| 4573 |
<ns2:Work>
|
| 4574 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4575 |
-
<dc:date>2025-10-
|
| 4576 |
<dc:format>image/svg+xml</dc:format>
|
| 4577 |
<dc:creator>
|
| 4578 |
<ns2:Agent>
|
|
@@ -4916,70 +4924,70 @@ Installed 37 packages in 214ms
|
|
| 4916 |
<g id="matplotlib.axis_2">
|
| 4917 |
<g id="ytick_1">
|
| 4918 |
<g id="grid-y--2" class="grid grid-y">
|
| 4919 |
-
<path d="M 47.72
|
| 4920 |
</g>
|
| 4921 |
<g id="line2d_25">
|
| 4922 |
<defs>
|
| 4923 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4924 |
</defs>
|
| 4925 |
<g>
|
| 4926 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4927 |
</g>
|
| 4928 |
</g>
|
| 4929 |
<g id="text_25">
|
| 4930 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4931 |
</g>
|
| 4932 |
</g>
|
| 4933 |
<g id="ytick_2">
|
| 4934 |
<g id="grid-y--3" class="grid grid-y">
|
| 4935 |
-
<path d="M 47.72
|
| 4936 |
</g>
|
| 4937 |
<g id="line2d_26">
|
| 4938 |
<g>
|
| 4939 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4940 |
</g>
|
| 4941 |
</g>
|
| 4942 |
<g id="text_26">
|
| 4943 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4944 |
</g>
|
| 4945 |
</g>
|
| 4946 |
<g id="ytick_3">
|
| 4947 |
<g id="grid-y--4" class="grid grid-y">
|
| 4948 |
-
<path d="M 47.72
|
| 4949 |
</g>
|
| 4950 |
<g id="line2d_27">
|
| 4951 |
<g>
|
| 4952 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4953 |
</g>
|
| 4954 |
</g>
|
| 4955 |
<g id="text_27">
|
| 4956 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4957 |
</g>
|
| 4958 |
</g>
|
| 4959 |
<g id="ytick_4">
|
| 4960 |
<g id="grid-y--5" class="grid grid-y">
|
| 4961 |
-
<path d="M 47.72
|
| 4962 |
</g>
|
| 4963 |
<g id="line2d_28">
|
| 4964 |
<g>
|
| 4965 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4966 |
</g>
|
| 4967 |
</g>
|
| 4968 |
<g id="text_28">
|
| 4969 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4970 |
</g>
|
| 4971 |
</g>
|
| 4972 |
<g id="ytick_5">
|
| 4973 |
<g id="grid-y--6" class="grid grid-y">
|
| 4974 |
-
<path d="M 47.72
|
| 4975 |
</g>
|
| 4976 |
<g id="line2d_29">
|
| 4977 |
<g>
|
| 4978 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4979 |
</g>
|
| 4980 |
</g>
|
| 4981 |
<g id="text_29">
|
| 4982 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4983 |
</g>
|
| 4984 |
</g>
|
| 4985 |
<g id="label--y" class="ylabel">
|
|
@@ -4987,66 +4995,66 @@ Installed 37 packages in 214ms
|
|
| 4987 |
</g>
|
| 4988 |
</g>
|
| 4989 |
<g id="series--hf-kernels-causal-conv1d" class="series">
|
| 4990 |
-
<path d="M 83.325193 420.186871 L 114.286231 413.
|
| 4991 |
<defs>
|
| 4992 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4993 |
</defs>
|
| 4994 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 4995 |
<use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4996 |
-
<use ns4:href="#md7efaf3aec" x="114.286231" y="413.
|
| 4997 |
-
<use ns4:href="#md7efaf3aec" x="145.247268" y="
|
| 4998 |
-
<use ns4:href="#md7efaf3aec" x="176.208306" y="414.
|
| 4999 |
-
<use ns4:href="#md7efaf3aec" x="207.169343" y="414.
|
| 5000 |
-
<use ns4:href="#md7efaf3aec" x="238.130381" y="
|
| 5001 |
-
<use ns4:href="#md7efaf3aec" x="269.091418" y="
|
| 5002 |
-
<use ns4:href="#md7efaf3aec" x="300.052455" y="415.
|
| 5003 |
-
<use ns4:href="#md7efaf3aec" x="331.013493" y="416.
|
| 5004 |
-
<use ns4:href="#md7efaf3aec" x="361.97453" y="415.
|
| 5005 |
-
<use ns4:href="#md7efaf3aec" x="392.935568" y="
|
| 5006 |
-
<use ns4:href="#md7efaf3aec" x="423.896605" y="416.
|
| 5007 |
-
<use ns4:href="#md7efaf3aec" x="454.857643" y="415.
|
| 5008 |
-
<use ns4:href="#md7efaf3aec" x="485.81868" y="
|
| 5009 |
-
<use ns4:href="#md7efaf3aec" x="516.779718" y="
|
| 5010 |
-
<use ns4:href="#md7efaf3aec" x="547.740755" y="415.
|
| 5011 |
-
<use ns4:href="#md7efaf3aec" x="578.701793" y="415.
|
| 5012 |
-
<use ns4:href="#md7efaf3aec" x="609.66283" y="
|
| 5013 |
-
<use ns4:href="#md7efaf3aec" x="640.623868" y="414.
|
| 5014 |
-
<use ns4:href="#md7efaf3aec" x="671.584905" y="
|
| 5015 |
-
<use ns4:href="#md7efaf3aec" x="702.545943" y="414.
|
| 5016 |
-
<use ns4:href="#md7efaf3aec" x="733.50698" y="414.
|
| 5017 |
-
<use ns4:href="#md7efaf3aec" x="764.468018" y="
|
| 5018 |
-
<use ns4:href="#md7efaf3aec" x="795.429055" y="
|
| 5019 |
</g>
|
| 5020 |
</g>
|
| 5021 |
<g id="series--torch-eager" class="series">
|
| 5022 |
-
<path d="M 83.325193
|
| 5023 |
<defs>
|
| 5024 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 5025 |
</defs>
|
| 5026 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 5027 |
-
<use ns4:href="#m9b8c54d372" x="83.325193" y="
|
| 5028 |
-
<use ns4:href="#m9b8c54d372" x="114.286231" y="
|
| 5029 |
-
<use ns4:href="#m9b8c54d372" x="145.247268" y="
|
| 5030 |
-
<use ns4:href="#m9b8c54d372" x="176.208306" y="
|
| 5031 |
-
<use ns4:href="#m9b8c54d372" x="207.169343" y="
|
| 5032 |
-
<use ns4:href="#m9b8c54d372" x="238.130381" y="
|
| 5033 |
-
<use ns4:href="#m9b8c54d372" x="269.091418" y="
|
| 5034 |
-
<use ns4:href="#m9b8c54d372" x="300.052455" y="
|
| 5035 |
-
<use ns4:href="#m9b8c54d372" x="331.013493" y="
|
| 5036 |
-
<use ns4:href="#m9b8c54d372" x="361.97453" y="
|
| 5037 |
-
<use ns4:href="#m9b8c54d372" x="392.935568" y="
|
| 5038 |
-
<use ns4:href="#m9b8c54d372" x="423.896605" y="
|
| 5039 |
-
<use ns4:href="#m9b8c54d372" x="454.857643" y="
|
| 5040 |
-
<use ns4:href="#m9b8c54d372" x="485.81868" y="
|
| 5041 |
-
<use ns4:href="#m9b8c54d372" x="516.779718" y="
|
| 5042 |
-
<use ns4:href="#m9b8c54d372" x="547.740755" y="
|
| 5043 |
-
<use ns4:href="#m9b8c54d372" x="578.701793" y="
|
| 5044 |
-
<use ns4:href="#m9b8c54d372" x="609.66283" y="
|
| 5045 |
-
<use ns4:href="#m9b8c54d372" x="640.623868" y="
|
| 5046 |
-
<use ns4:href="#m9b8c54d372" x="671.584905" y="
|
| 5047 |
-
<use ns4:href="#m9b8c54d372" x="702.545943" y="
|
| 5048 |
-
<use ns4:href="#m9b8c54d372" x="733.50698" y="
|
| 5049 |
-
<use ns4:href="#m9b8c54d372" x="764.468018" y="
|
| 5050 |
<use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5051 |
</g>
|
| 5052 |
</g>
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
|
|
|
| 3880 |
<rdf:RDF>
|
| 3881 |
<ns2:Work>
|
| 3882 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3883 |
+
<dc:date>2025-10-29T15:50:56.264680</dc:date>
|
| 3884 |
<dc:format>image/svg+xml</dc:format>
|
| 3885 |
<dc:creator>
|
| 3886 |
<ns2:Agent>
|
|
|
|
| 4224 |
<g id="matplotlib.axis_2">
|
| 4225 |
<g id="ytick_1">
|
| 4226 |
<g id="grid-y--2" class="grid grid-y">
|
| 4227 |
+
<path d="M 47.72 373.1985 L 831.034248 373.1985 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4228 |
</g>
|
| 4229 |
<g id="line2d_25">
|
| 4230 |
<defs>
|
| 4231 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4232 |
</defs>
|
| 4233 |
<g>
|
| 4234 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="373.1985" style="stroke: #000000; stroke-width: 0.8" />
|
| 4235 |
</g>
|
| 4236 |
</g>
|
| 4237 |
<g id="text_25">
|
| 4238 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="376.997718" transform="rotate(-0 40.72 376.997718)">0.1</text>
|
| 4239 |
</g>
|
| 4240 |
</g>
|
| 4241 |
<g id="ytick_2">
|
| 4242 |
<g id="grid-y--3" class="grid grid-y">
|
| 4243 |
+
<path d="M 47.72 290.703423 L 831.034248 290.703423 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4244 |
</g>
|
| 4245 |
<g id="line2d_26">
|
| 4246 |
<g>
|
| 4247 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="290.703423" style="stroke: #000000; stroke-width: 0.8" />
|
| 4248 |
</g>
|
| 4249 |
</g>
|
| 4250 |
<g id="text_26">
|
| 4251 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="294.502641" transform="rotate(-0 40.72 294.502641)">0.2</text>
|
| 4252 |
</g>
|
| 4253 |
</g>
|
| 4254 |
<g id="ytick_3">
|
| 4255 |
<g id="grid-y--4" class="grid grid-y">
|
| 4256 |
+
<path d="M 47.72 208.208345 L 831.034248 208.208345 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4257 |
</g>
|
| 4258 |
<g id="line2d_27">
|
| 4259 |
<g>
|
| 4260 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="208.208345" style="stroke: #000000; stroke-width: 0.8" />
|
| 4261 |
</g>
|
| 4262 |
</g>
|
| 4263 |
<g id="text_27">
|
| 4264 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="212.007564" transform="rotate(-0 40.72 212.007564)">0.3</text>
|
| 4265 |
</g>
|
| 4266 |
</g>
|
| 4267 |
<g id="ytick_4">
|
| 4268 |
<g id="grid-y--5" class="grid grid-y">
|
| 4269 |
+
<path d="M 47.72 125.713268 L 831.034248 125.713268 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4270 |
</g>
|
| 4271 |
<g id="line2d_28">
|
| 4272 |
<g>
|
| 4273 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="125.713268" style="stroke: #000000; stroke-width: 0.8" />
|
| 4274 |
</g>
|
| 4275 |
</g>
|
| 4276 |
<g id="text_28">
|
| 4277 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="129.512487" transform="rotate(-0 40.72 129.512487)">0.4</text>
|
| 4278 |
</g>
|
| 4279 |
</g>
|
| 4280 |
<g id="ytick_5">
|
| 4281 |
<g id="grid-y--6" class="grid grid-y">
|
| 4282 |
+
<path d="M 47.72 43.218191 L 831.034248 43.218191 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4283 |
</g>
|
| 4284 |
<g id="line2d_29">
|
| 4285 |
<g>
|
| 4286 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="43.218191" style="stroke: #000000; stroke-width: 0.8" />
|
| 4287 |
</g>
|
| 4288 |
</g>
|
| 4289 |
<g id="text_29">
|
| 4290 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="47.01741" transform="rotate(-0 40.72 47.01741)">0.5</text>
|
| 4291 |
</g>
|
| 4292 |
</g>
|
| 4293 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4295 |
</g>
|
| 4296 |
</g>
|
| 4297 |
<g id="series--hf-kernels-causal-conv1d" class="series">
|
| 4298 |
+
<path d="M 83.325193 420.186871 L 114.286231 413.282033 L 145.247268 414.725697 L 176.208306 414.527708 L 207.169343 414.733946 L 238.130381 414.742196 L 269.091418 415.294913 L 300.052455 415.129922 L 331.013493 416.168535 L 361.97453 415.319661 L 392.935568 415.451653 L 423.896605 416.399522 L 454.857643 415.476402 L 485.81868 414.915435 L 516.779718 415.558897 L 547.740755 415.080425 L 578.701793 415.905376 L 609.66283 414.428714 L 640.623868 414.980606 L 671.584905 414.964932 L 702.545943 414.634952 L 733.50698 414.321471 L 764.468018 414.766944 L 795.429055 414.676199 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4299 |
<defs>
|
| 4300 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4301 |
</defs>
|
| 4302 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 4303 |
<use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4304 |
+
<use ns4:href="#md7efaf3aec" x="114.286231" y="413.282033" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4305 |
+
<use ns4:href="#md7efaf3aec" x="145.247268" y="414.725697" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4306 |
+
<use ns4:href="#md7efaf3aec" x="176.208306" y="414.527708" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4307 |
+
<use ns4:href="#md7efaf3aec" x="207.169343" y="414.733946" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4308 |
+
<use ns4:href="#md7efaf3aec" x="238.130381" y="414.742196" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4309 |
+
<use ns4:href="#md7efaf3aec" x="269.091418" y="415.294913" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4310 |
+
<use ns4:href="#md7efaf3aec" x="300.052455" y="415.129922" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4311 |
+
<use ns4:href="#md7efaf3aec" x="331.013493" y="416.168535" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4312 |
+
<use ns4:href="#md7efaf3aec" x="361.97453" y="415.319661" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4313 |
+
<use ns4:href="#md7efaf3aec" x="392.935568" y="415.451653" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4314 |
+
<use ns4:href="#md7efaf3aec" x="423.896605" y="416.399522" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4315 |
+
<use ns4:href="#md7efaf3aec" x="454.857643" y="415.476402" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4316 |
+
<use ns4:href="#md7efaf3aec" x="485.81868" y="414.915435" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4317 |
+
<use ns4:href="#md7efaf3aec" x="516.779718" y="415.558897" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4318 |
+
<use ns4:href="#md7efaf3aec" x="547.740755" y="415.080425" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4319 |
+
<use ns4:href="#md7efaf3aec" x="578.701793" y="415.905376" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4320 |
+
<use ns4:href="#md7efaf3aec" x="609.66283" y="414.428714" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4321 |
+
<use ns4:href="#md7efaf3aec" x="640.623868" y="414.980606" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4322 |
+
<use ns4:href="#md7efaf3aec" x="671.584905" y="414.964932" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4323 |
+
<use ns4:href="#md7efaf3aec" x="702.545943" y="414.634952" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4324 |
+
<use ns4:href="#md7efaf3aec" x="733.50698" y="414.321471" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4325 |
+
<use ns4:href="#md7efaf3aec" x="764.468018" y="414.766944" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4326 |
+
<use ns4:href="#md7efaf3aec" x="795.429055" y="414.676199" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4327 |
</g>
|
| 4328 |
</g>
|
| 4329 |
<g id="series--torch-eager" class="series">
|
| 4330 |
+
<path d="M 83.325193 397.16167 L 114.286231 384.655416 L 145.247268 385.439119 L 176.208306 386.603125 L 207.169343 386.93228 L 238.130381 386.948779 L 269.091418 386.320992 L 300.052455 386.924031 L 331.013493 386.998276 L 361.97453 386.93228 L 392.935568 335.710262 L 423.896605 321.165555 L 454.857643 387.52707 L 485.81868 387.60874 L 516.779718 388.243952 L 547.740755 387.27051 L 578.701793 386.635298 L 609.66283 387.072522 L 640.623868 387.048598 L 671.584905 386.643547 L 702.545943 378.377541 L 733.50698 373.106105 L 764.468018 54.3872 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4331 |
<defs>
|
| 4332 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4333 |
</defs>
|
| 4334 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 4335 |
+
<use ns4:href="#m9b8c54d372" x="83.325193" y="397.16167" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4336 |
+
<use ns4:href="#m9b8c54d372" x="114.286231" y="384.655416" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4337 |
+
<use ns4:href="#m9b8c54d372" x="145.247268" y="385.439119" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4338 |
+
<use ns4:href="#m9b8c54d372" x="176.208306" y="386.603125" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4339 |
+
<use ns4:href="#m9b8c54d372" x="207.169343" y="386.93228" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4340 |
+
<use ns4:href="#m9b8c54d372" x="238.130381" y="386.948779" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4341 |
+
<use ns4:href="#m9b8c54d372" x="269.091418" y="386.320992" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4342 |
+
<use ns4:href="#m9b8c54d372" x="300.052455" y="386.924031" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4343 |
+
<use ns4:href="#m9b8c54d372" x="331.013493" y="386.998276" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4344 |
+
<use ns4:href="#m9b8c54d372" x="361.97453" y="386.93228" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4345 |
+
<use ns4:href="#m9b8c54d372" x="392.935568" y="335.710262" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4346 |
+
<use ns4:href="#m9b8c54d372" x="423.896605" y="321.165555" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4347 |
+
<use ns4:href="#m9b8c54d372" x="454.857643" y="387.52707" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4348 |
+
<use ns4:href="#m9b8c54d372" x="485.81868" y="387.60874" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4349 |
+
<use ns4:href="#m9b8c54d372" x="516.779718" y="388.243952" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4350 |
+
<use ns4:href="#m9b8c54d372" x="547.740755" y="387.27051" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4351 |
+
<use ns4:href="#m9b8c54d372" x="578.701793" y="386.635298" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4352 |
+
<use ns4:href="#m9b8c54d372" x="609.66283" y="387.072522" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4353 |
+
<use ns4:href="#m9b8c54d372" x="640.623868" y="387.048598" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4354 |
+
<use ns4:href="#m9b8c54d372" x="671.584905" y="386.643547" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4355 |
+
<use ns4:href="#m9b8c54d372" x="702.545943" y="378.377541" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4356 |
+
<use ns4:href="#m9b8c54d372" x="733.50698" y="373.106105" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4357 |
+
<use ns4:href="#m9b8c54d372" x="764.468018" y="54.3872" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4358 |
<use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4359 |
</g>
|
| 4360 |
</g>
|
|
|
|
| 4413 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4414 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4415 |
</span> |
|
| 4416 |
+
Cell: combine | 4.35s
|
| 4417 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4418 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4419 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4506 |
hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True
|
| 4507 |
hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True
|
| 4508 |
hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True
|
| 4509 |
+
hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.04 True
|
| 4510 |
+
hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.05 True
|
| 4511 |
+
hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.05 True
|
| 4512 |
hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True
|
| 4513 |
+
hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.05 True
|
| 4514 |
+
hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.05 True
|
| 4515 |
hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2 0.05 True
|
| 4516 |
hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4 0.05 True
|
| 4517 |
hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2 0.05 True
|
|
|
|
| 4522 |
hf_kernels_causal_conv1d cuda_B4_D64_S128_W4 0.05 True
|
| 4523 |
hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2 0.05 True
|
| 4524 |
hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4 0.05 True
|
| 4525 |
+
hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.05 True
|
| 4526 |
hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
|
| 4527 |
torch_eager cuda_B2_D2048_S128_W2 0.08 True
|
| 4528 |
torch_eager cuda_B2_D2048_S128_W4 0.08 True
|
|
|
|
| 4545 |
torch_eager cuda_B4_D64_S128_W2 0.08 True
|
| 4546 |
torch_eager cuda_B4_D64_S128_W4 0.08 True
|
| 4547 |
torch_eager cuda_B4_D64_S2048_W2 0.08 True
|
| 4548 |
+
torch_eager cuda_B4_D64_S2048_W4 0.08 True
|
| 4549 |
torch_eager cuda_B4_D64_S512_W2 0.08 True
|
| 4550 |
torch_eager cuda_B4_D64_S512_W4 0.08 True
|
| 4551 |
|
|
|
|
| 4567 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4568 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4569 |
<div class="uv-logs-content" style="display: none;">
|
| 4570 |
+
Installed 37 packages in 191ms
|
| 4571 |
</div>
|
| 4572 |
</div>
|
| 4573 |
<div class="cell-artifacts">
|
|
|
|
| 4580 |
<rdf:RDF>
|
| 4581 |
<ns2:Work>
|
| 4582 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4583 |
+
<dc:date>2025-10-29T15:50:56.264680</dc:date>
|
| 4584 |
<dc:format>image/svg+xml</dc:format>
|
| 4585 |
<dc:creator>
|
| 4586 |
<ns2:Agent>
|
|
|
|
| 4924 |
<g id="matplotlib.axis_2">
|
| 4925 |
<g id="ytick_1">
|
| 4926 |
<g id="grid-y--2" class="grid grid-y">
|
| 4927 |
+
<path d="M 47.72 373.1985 L 831.034248 373.1985 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4928 |
</g>
|
| 4929 |
<g id="line2d_25">
|
| 4930 |
<defs>
|
| 4931 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4932 |
</defs>
|
| 4933 |
<g>
|
| 4934 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="373.1985" style="stroke: #000000; stroke-width: 0.8" />
|
| 4935 |
</g>
|
| 4936 |
</g>
|
| 4937 |
<g id="text_25">
|
| 4938 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="376.997718" transform="rotate(-0 40.72 376.997718)">0.1</text>
|
| 4939 |
</g>
|
| 4940 |
</g>
|
| 4941 |
<g id="ytick_2">
|
| 4942 |
<g id="grid-y--3" class="grid grid-y">
|
| 4943 |
+
<path d="M 47.72 290.703423 L 831.034248 290.703423 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4944 |
</g>
|
| 4945 |
<g id="line2d_26">
|
| 4946 |
<g>
|
| 4947 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="290.703423" style="stroke: #000000; stroke-width: 0.8" />
|
| 4948 |
</g>
|
| 4949 |
</g>
|
| 4950 |
<g id="text_26">
|
| 4951 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="294.502641" transform="rotate(-0 40.72 294.502641)">0.2</text>
|
| 4952 |
</g>
|
| 4953 |
</g>
|
| 4954 |
<g id="ytick_3">
|
| 4955 |
<g id="grid-y--4" class="grid grid-y">
|
| 4956 |
+
<path d="M 47.72 208.208345 L 831.034248 208.208345 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4957 |
</g>
|
| 4958 |
<g id="line2d_27">
|
| 4959 |
<g>
|
| 4960 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="208.208345" style="stroke: #000000; stroke-width: 0.8" />
|
| 4961 |
</g>
|
| 4962 |
</g>
|
| 4963 |
<g id="text_27">
|
| 4964 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="212.007564" transform="rotate(-0 40.72 212.007564)">0.3</text>
|
| 4965 |
</g>
|
| 4966 |
</g>
|
| 4967 |
<g id="ytick_4">
|
| 4968 |
<g id="grid-y--5" class="grid grid-y">
|
| 4969 |
+
<path d="M 47.72 125.713268 L 831.034248 125.713268 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4970 |
</g>
|
| 4971 |
<g id="line2d_28">
|
| 4972 |
<g>
|
| 4973 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="125.713268" style="stroke: #000000; stroke-width: 0.8" />
|
| 4974 |
</g>
|
| 4975 |
</g>
|
| 4976 |
<g id="text_28">
|
| 4977 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="129.512487" transform="rotate(-0 40.72 129.512487)">0.4</text>
|
| 4978 |
</g>
|
| 4979 |
</g>
|
| 4980 |
<g id="ytick_5">
|
| 4981 |
<g id="grid-y--6" class="grid grid-y">
|
| 4982 |
+
<path d="M 47.72 43.218191 L 831.034248 43.218191 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4983 |
</g>
|
| 4984 |
<g id="line2d_29">
|
| 4985 |
<g>
|
| 4986 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="43.218191" style="stroke: #000000; stroke-width: 0.8" />
|
| 4987 |
</g>
|
| 4988 |
</g>
|
| 4989 |
<g id="text_29">
|
| 4990 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="47.01741" transform="rotate(-0 40.72 47.01741)">0.5</text>
|
| 4991 |
</g>
|
| 4992 |
</g>
|
| 4993 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4995 |
</g>
|
| 4996 |
</g>
|
| 4997 |
<g id="series--hf-kernels-causal-conv1d" class="series">
|
| 4998 |
+
<path d="M 83.325193 420.186871 L 114.286231 413.282033 L 145.247268 414.725697 L 176.208306 414.527708 L 207.169343 414.733946 L 238.130381 414.742196 L 269.091418 415.294913 L 300.052455 415.129922 L 331.013493 416.168535 L 361.97453 415.319661 L 392.935568 415.451653 L 423.896605 416.399522 L 454.857643 415.476402 L 485.81868 414.915435 L 516.779718 415.558897 L 547.740755 415.080425 L 578.701793 415.905376 L 609.66283 414.428714 L 640.623868 414.980606 L 671.584905 414.964932 L 702.545943 414.634952 L 733.50698 414.321471 L 764.468018 414.766944 L 795.429055 414.676199 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4999 |
<defs>
|
| 5000 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 5001 |
</defs>
|
| 5002 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 5003 |
<use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5004 |
+
<use ns4:href="#md7efaf3aec" x="114.286231" y="413.282033" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5005 |
+
<use ns4:href="#md7efaf3aec" x="145.247268" y="414.725697" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5006 |
+
<use ns4:href="#md7efaf3aec" x="176.208306" y="414.527708" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5007 |
+
<use ns4:href="#md7efaf3aec" x="207.169343" y="414.733946" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5008 |
+
<use ns4:href="#md7efaf3aec" x="238.130381" y="414.742196" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5009 |
+
<use ns4:href="#md7efaf3aec" x="269.091418" y="415.294913" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5010 |
+
<use ns4:href="#md7efaf3aec" x="300.052455" y="415.129922" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5011 |
+
<use ns4:href="#md7efaf3aec" x="331.013493" y="416.168535" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5012 |
+
<use ns4:href="#md7efaf3aec" x="361.97453" y="415.319661" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5013 |
+
<use ns4:href="#md7efaf3aec" x="392.935568" y="415.451653" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5014 |
+
<use ns4:href="#md7efaf3aec" x="423.896605" y="416.399522" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5015 |
+
<use ns4:href="#md7efaf3aec" x="454.857643" y="415.476402" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5016 |
+
<use ns4:href="#md7efaf3aec" x="485.81868" y="414.915435" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5017 |
+
<use ns4:href="#md7efaf3aec" x="516.779718" y="415.558897" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5018 |
+
<use ns4:href="#md7efaf3aec" x="547.740755" y="415.080425" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5019 |
+
<use ns4:href="#md7efaf3aec" x="578.701793" y="415.905376" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5020 |
+
<use ns4:href="#md7efaf3aec" x="609.66283" y="414.428714" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5021 |
+
<use ns4:href="#md7efaf3aec" x="640.623868" y="414.980606" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5022 |
+
<use ns4:href="#md7efaf3aec" x="671.584905" y="414.964932" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5023 |
+
<use ns4:href="#md7efaf3aec" x="702.545943" y="414.634952" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5024 |
+
<use ns4:href="#md7efaf3aec" x="733.50698" y="414.321471" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5025 |
+
<use ns4:href="#md7efaf3aec" x="764.468018" y="414.766944" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5026 |
+
<use ns4:href="#md7efaf3aec" x="795.429055" y="414.676199" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5027 |
</g>
|
| 5028 |
</g>
|
| 5029 |
<g id="series--torch-eager" class="series">
|
| 5030 |
+
<path d="M 83.325193 397.16167 L 114.286231 384.655416 L 145.247268 385.439119 L 176.208306 386.603125 L 207.169343 386.93228 L 238.130381 386.948779 L 269.091418 386.320992 L 300.052455 386.924031 L 331.013493 386.998276 L 361.97453 386.93228 L 392.935568 335.710262 L 423.896605 321.165555 L 454.857643 387.52707 L 485.81868 387.60874 L 516.779718 388.243952 L 547.740755 387.27051 L 578.701793 386.635298 L 609.66283 387.072522 L 640.623868 387.048598 L 671.584905 386.643547 L 702.545943 378.377541 L 733.50698 373.106105 L 764.468018 54.3872 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 5031 |
<defs>
|
| 5032 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 5033 |
</defs>
|
| 5034 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 5035 |
+
<use ns4:href="#m9b8c54d372" x="83.325193" y="397.16167" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5036 |
+
<use ns4:href="#m9b8c54d372" x="114.286231" y="384.655416" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5037 |
+
<use ns4:href="#m9b8c54d372" x="145.247268" y="385.439119" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5038 |
+
<use ns4:href="#m9b8c54d372" x="176.208306" y="386.603125" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5039 |
+
<use ns4:href="#m9b8c54d372" x="207.169343" y="386.93228" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5040 |
+
<use ns4:href="#m9b8c54d372" x="238.130381" y="386.948779" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5041 |
+
<use ns4:href="#m9b8c54d372" x="269.091418" y="386.320992" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5042 |
+
<use ns4:href="#m9b8c54d372" x="300.052455" y="386.924031" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5043 |
+
<use ns4:href="#m9b8c54d372" x="331.013493" y="386.998276" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5044 |
+
<use ns4:href="#m9b8c54d372" x="361.97453" y="386.93228" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5045 |
+
<use ns4:href="#m9b8c54d372" x="392.935568" y="335.710262" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5046 |
+
<use ns4:href="#m9b8c54d372" x="423.896605" y="321.165555" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5047 |
+
<use ns4:href="#m9b8c54d372" x="454.857643" y="387.52707" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5048 |
+
<use ns4:href="#m9b8c54d372" x="485.81868" y="387.60874" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5049 |
+
<use ns4:href="#m9b8c54d372" x="516.779718" y="388.243952" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5050 |
+
<use ns4:href="#m9b8c54d372" x="547.740755" y="387.27051" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5051 |
+
<use ns4:href="#m9b8c54d372" x="578.701793" y="386.635298" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5052 |
+
<use ns4:href="#m9b8c54d372" x="609.66283" y="387.072522" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5053 |
+
<use ns4:href="#m9b8c54d372" x="640.623868" y="387.048598" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5054 |
+
<use ns4:href="#m9b8c54d372" x="671.584905" y="386.643547" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5055 |
+
<use ns4:href="#m9b8c54d372" x="702.545943" y="378.377541" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5056 |
+
<use ns4:href="#m9b8c54d372" x="733.50698" y="373.106105" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5057 |
+
<use ns4:href="#m9b8c54d372" x="764.468018" y="54.3872" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5058 |
<use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5059 |
</g>
|
| 5060 |
</g>
|
flash_attn/impls/artifacts/benchmark/attention.jsonl
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
{"ts": "2025-10-
|
| 2 |
-
{"ts": "2025-10-
|
| 3 |
-
{"ts": "2025-10-
|
| 4 |
-
{"ts": "2025-10-
|
| 5 |
-
{"ts": "2025-10-
|
| 6 |
-
{"ts": "2025-10-
|
|
|
|
| 1 |
+
{"ts": "2025-10-29T15:50:47Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.8233430000123008, "p50": 1.8343830000162598, "p90": 1.8450139999686144, "mean": 1.8363673999942876, "iqr": 0.021300000014434772, "raw_times": [1.8450139999686144, 1.8233430000123008, 1.8237139999541796, 1.8343830000162598, 1.8553830000200833], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.8232439999792405, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-29T15:50:47Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.8942840000022443, "p50": 1.9424449999974058, "p90": 1.9434060000094178, "mean": 1.9367254000030698, "iqr": 0.0023400000372930663, "raw_times": [1.8942840000022443, 1.9424449999974058, 1.9410659999721247, 1.9434060000094178, 1.9624260000341565], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.9008649999818772, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-29T15:50:48Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.942595999992136, "p50": 1.9503360000499015, "p90": 2.019877999998698, "mean": 1.9758666000029734, "iqr": 0.0764520000302582, "raw_times": [1.9503360000499015, 1.94342599996844, 1.942595999992136, 2.019877999998698, 2.0230970000056914], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.9501660000287302, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-29T15:50:48Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.9654459999856044, "p50": 2.0491880000008678, "p90": 2.050657999973282, "mean": 2.0347600000036437, "iqr": 0.0033989999224104395, "raw_times": [1.9654459999856044, 2.0491880000008678, 2.0472590000508717, 2.050657999973282, 2.0612490000075923], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.0352980000097887, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-29T15:50:48Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 2.0188670000038655, "p50": 2.067507999981899, "p90": 2.1027900000376576, "mean": 2.0633722000184207, "iqr": 0.07837300000801406, "raw_times": [2.0188670000038655, 2.0244170000296435, 2.067507999981899, 2.1027900000376576, 2.103279000039038], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.0235979999938536, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-29T15:50:48Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 2.1849919999681333, "p50": 2.1887119999632887, "p90": 2.2487329999876238, "mean": 2.212510399988332, "iqr": 0.06324099996390942, "raw_times": [2.1849919999681333, 2.1887119999632887, 2.1854920000237144, 2.2487329999876238, 2.254622999998901], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.1668410000188487, "peak_bytes": 319946752, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
flash_attn/impls/cells/benchmark.py
CHANGED
|
@@ -4,7 +4,6 @@
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
-
# "xformers",
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
|
@@ -13,18 +12,20 @@
|
|
| 13 |
import torch
|
| 14 |
import sys
|
| 15 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
-
import xformers.ops as xops
|
| 17 |
|
| 18 |
|
| 19 |
-
def
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
run_benchmark(
|
| 26 |
kernel_type=KernelTypeEnum.ATTENTION,
|
| 27 |
-
impl_name="
|
| 28 |
-
impl_tags={"family": "
|
| 29 |
-
impl_func=
|
| 30 |
)
|
|
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
|
|
|
| 7 |
# ]
|
| 8 |
#
|
| 9 |
# [tool.uv.sources]
|
|
|
|
| 12 |
import torch
|
| 13 |
import sys
|
| 14 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
+
def torch_mem_eff(q, k, v):
|
| 18 |
+
qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
|
| 19 |
+
with torch.nn.attention.sdpa_kernel(
|
| 20 |
+
torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION
|
| 21 |
+
):
|
| 22 |
+
o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
|
| 23 |
+
return o.transpose(1, 2).contiguous()
|
| 24 |
|
| 25 |
|
| 26 |
run_benchmark(
|
| 27 |
kernel_type=KernelTypeEnum.ATTENTION,
|
| 28 |
+
impl_name="torch_mem_eff",
|
| 29 |
+
impl_tags={"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"},
|
| 30 |
+
impl_func=torch_mem_eff,
|
| 31 |
)
|
flash_attn/impls/flash_attention.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
@@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3871 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: nv | 0.
|
| 3875 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3877 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3888,7 +3896,7 @@ Cell: nv | 0.28s
|
|
| 3888 |
</div>
|
| 3889 |
</div>
|
| 3890 |
<div id="output-nv" class="cell-output">
|
| 3891 |
-
<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29
|
| 3892 |
+-----------------------------------------------------------------------------------------+
|
| 3893 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3894 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -3897,7 +3905,7 @@ Cell: nv | 0.28s
|
|
| 3897 |
| | | MIG M. |
|
| 3898 |
|=========================================+========================+======================|
|
| 3899 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3900 |
-
| N/A
|
| 3901 |
| | | N/A |
|
| 3902 |
+-----------------------------------------+------------------------+----------------------+
|
| 3903 |
|
|
@@ -3919,9 +3927,9 @@ Cell: nv | 0.28s
|
|
| 3919 |
<span class="collapse-indicators">
|
| 3920 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3921 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3922 |
-
<span id="uv-indicator-benchmark"
|
| 3923 |
</span> |
|
| 3924 |
-
Cell: benchmark |
|
| 3925 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3926 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3927 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3972,29 +3980,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
|
|
| 3972 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3973 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3974 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3975 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 3976 |
-
torch_flash_ma 6.
|
| 3977 |
-
aten::scaled_dot_product_attention 0.
|
| 3978 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 3979 |
-
aten::_flash_attention_forward 0.
|
| 3980 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3981 |
-
aten::contiguous 0.27%
|
| 3982 |
-
aten::clone 0.
|
| 3983 |
-
aten::copy_ 1.
|
| 3984 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3985 |
-
Activity Buffer Request
|
| 3986 |
-
aten::transpose 1.
|
| 3987 |
-
aten::as_strided 0.
|
| 3988 |
-
aten::empty_like 0.
|
| 3989 |
-
aten::empty 1.
|
| 3990 |
-
cudaLaunchKernel 2.
|
| 3991 |
-
aten::empty_strided 0.
|
| 3992 |
-
cudaDeviceGetAttribute 0.
|
| 3993 |
-
cudaFuncSetAttribute 0.
|
| 3994 |
-
cudaDeviceSynchronize
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
-
Self CPU time total: 5.
|
| 3997 |
-
Self CUDA time total: 3.
|
| 3998 |
|
| 3999 |
|
| 4000 |
|
|
@@ -4004,29 +4012,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
|
|
| 4004 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4005 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4006 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4007 |
-
torch_flash_ma 4.
|
| 4008 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4009 |
-
aten::scaled_dot_product_attention 0.
|
| 4010 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4011 |
-
aten::_flash_attention_forward 0.
|
| 4012 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4013 |
-
aten::contiguous 0.
|
| 4014 |
-
aten::clone 0.
|
| 4015 |
-
aten::copy_
|
| 4016 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4017 |
-
Activity Buffer Request
|
| 4018 |
-
aten::transpose
|
| 4019 |
-
aten::as_strided 0.
|
| 4020 |
-
aten::empty_like 0.
|
| 4021 |
-
aten::empty 1.
|
| 4022 |
-
cudaLaunchKernel
|
| 4023 |
-
aten::empty_strided 0.
|
| 4024 |
-
cudaDeviceGetAttribute 0.03% 1.
|
| 4025 |
-
cudaFuncSetAttribute 0.07% 3.
|
| 4026 |
-
cudaDeviceSynchronize
|
| 4027 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4028 |
-
Self CPU time total: 5.
|
| 4029 |
-
Self CUDA time total: 3.
|
| 4030 |
|
| 4031 |
|
| 4032 |
|
|
@@ -4036,29 +4044,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
|
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4038 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4039 |
-
torch_flash_ma 4.
|
| 4040 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4041 |
-
aten::scaled_dot_product_attention 0.
|
| 4042 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4043 |
-
aten::_flash_attention_forward 0.
|
| 4044 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4045 |
-
aten::contiguous 0.
|
| 4046 |
-
aten::clone 0.
|
| 4047 |
-
aten::copy_ 1.
|
| 4048 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4049 |
-
Activity Buffer Request 27.
|
| 4050 |
-
aten::transpose 0.
|
| 4051 |
-
aten::as_strided 0.
|
| 4052 |
-
aten::empty_like 0.
|
| 4053 |
-
aten::empty 1.
|
| 4054 |
-
cudaLaunchKernel
|
| 4055 |
-
aten::empty_strided 0.
|
| 4056 |
-
cudaDeviceGetAttribute 0.03% 1.
|
| 4057 |
-
cudaFuncSetAttribute 0.07% 3.
|
| 4058 |
-
cudaDeviceSynchronize
|
| 4059 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4060 |
-
Self CPU time total: 5.
|
| 4061 |
-
Self CUDA time total: 3.
|
| 4062 |
|
| 4063 |
|
| 4064 |
|
|
@@ -4068,29 +4076,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
|
|
| 4068 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4069 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4070 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4071 |
-
torch_flash_ma 4.
|
| 4072 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4073 |
-
aten::scaled_dot_product_attention 0.
|
| 4074 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4075 |
-
aten::_flash_attention_forward 0.
|
| 4076 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4077 |
-
aten::contiguous 0.17% 9.
|
| 4078 |
-
aten::clone 0.
|
| 4079 |
-
aten::copy_ 1.
|
| 4080 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4081 |
-
Activity Buffer Request 25.
|
| 4082 |
-
aten::transpose 0.
|
| 4083 |
-
aten::as_strided 0.
|
| 4084 |
-
aten::empty_like 0.
|
| 4085 |
-
aten::empty 1.
|
| 4086 |
-
cudaLaunchKernel 5.
|
| 4087 |
-
aten::empty_strided 0.
|
| 4088 |
-
cudaDeviceGetAttribute 0.03% 1.
|
| 4089 |
-
cudaFuncSetAttribute 0.
|
| 4090 |
-
cudaDeviceSynchronize
|
| 4091 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4092 |
-
Self CPU time total: 5.
|
| 4093 |
-
Self CUDA time total: 3.
|
| 4094 |
|
| 4095 |
|
| 4096 |
|
|
@@ -4100,29 +4108,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
|
|
| 4100 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4101 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4102 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4103 |
-
torch_flash_ma
|
| 4104 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4105 |
-
aten::scaled_dot_product_attention 0.
|
| 4106 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4107 |
-
aten::_flash_attention_forward 0.
|
| 4108 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4109 |
-
aten::contiguous 0.
|
| 4110 |
-
aten::clone 0.
|
| 4111 |
-
aten::copy_ 1.
|
| 4112 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4113 |
-
Activity Buffer Request
|
| 4114 |
-
aten::transpose 0.
|
| 4115 |
-
aten::as_strided 0.
|
| 4116 |
-
aten::empty_like 0.
|
| 4117 |
-
aten::empty 1.
|
| 4118 |
-
cudaLaunchKernel
|
| 4119 |
-
aten::empty_strided 0.
|
| 4120 |
-
cudaDeviceGetAttribute 0.03%
|
| 4121 |
-
cudaFuncSetAttribute 0.06% 3.
|
| 4122 |
-
cudaDeviceSynchronize 59.
|
| 4123 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4124 |
-
Self CPU time total: 6.
|
| 4125 |
-
Self CUDA time total: 4.
|
| 4126 |
|
| 4127 |
|
| 4128 |
|
|
@@ -4132,91 +4140,39 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
|
|
| 4132 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4133 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4134 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4135 |
-
torch_flash_ma 4.
|
| 4136 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4137 |
-
aten::scaled_dot_product_attention 0.
|
| 4138 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4139 |
-
aten::_flash_attention_forward 0.
|
| 4140 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4141 |
-
aten::contiguous 0.16%
|
| 4142 |
-
aten::clone 0.50% 30.
|
| 4143 |
-
aten::copy_ 1.
|
| 4144 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4145 |
-
Activity Buffer Request 23.
|
| 4146 |
-
aten::transpose 0.
|
| 4147 |
-
aten::as_strided 0.
|
| 4148 |
-
aten::empty_like 0.33% 20.
|
| 4149 |
-
aten::empty 1.
|
| 4150 |
-
cudaLaunchKernel 5.
|
| 4151 |
-
aten::empty_strided 0.25% 15.
|
| 4152 |
-
cudaDeviceGetAttribute 0.03% 1.
|
| 4153 |
-
cudaFuncSetAttribute 0.06% 3.
|
| 4154 |
-
cudaDeviceSynchronize
|
| 4155 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4156 |
-
Self CPU time total: 6.
|
| 4157 |
-
Self CUDA time total: 4.
|
| 4158 |
|
| 4159 |
|
| 4160 |
impl wl p50(ms) ok
|
| 4161 |
-
torch_flash_ma cuda_attn_L128_bfloat16 1.
|
| 4162 |
-
torch_flash_ma cuda_attn_L256_bfloat16 1.
|
| 4163 |
-
torch_flash_ma cuda_attn_L320_bfloat16 1.
|
| 4164 |
-
torch_flash_ma cuda_attn_L384_bfloat16 1.
|
| 4165 |
-
torch_flash_ma cuda_attn_L448_bfloat16 1.
|
| 4166 |
-
torch_flash_ma cuda_attn_L512_bfloat16 1.
|
| 4167 |
</pre></div>
|
| 4168 |
-
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4169 |
-
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4170 |
-
<div class="uv-logs-content" style="display: none;">
|
| 4171 |
-
Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
|
| 4172 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4173 |
-
Downloading matplotlib (8.3MiB)
|
| 4174 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4175 |
-
Downloading numpy (16.2MiB)
|
| 4176 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4177 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4178 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4179 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4180 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4181 |
-
Downloading kiwisolver (1.4MiB)
|
| 4182 |
-
Downloading networkx (1.9MiB)
|
| 4183 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4184 |
-
Downloading sympy (6.0MiB)
|
| 4185 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4186 |
-
Downloading setuptools (1.1MiB)
|
| 4187 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4188 |
-
Downloading triton (148.3MiB)
|
| 4189 |
-
Downloading pillow (6.7MiB)
|
| 4190 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4191 |
-
Downloading fonttools (4.7MiB)
|
| 4192 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4193 |
-
Downloading torch (846.9MiB)
|
| 4194 |
-
Downloading nvidia-cufile-cu12
|
| 4195 |
-
Downloading kiwisolver
|
| 4196 |
-
Downloading setuptools
|
| 4197 |
-
Downloading networkx
|
| 4198 |
-
Downloading fonttools
|
| 4199 |
-
Downloading pillow
|
| 4200 |
-
Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
|
| 4201 |
-
Downloading nvidia-cuda-cupti-cu12
|
| 4202 |
-
Downloading matplotlib
|
| 4203 |
-
Downloading numpy
|
| 4204 |
-
Downloading sympy
|
| 4205 |
-
Downloading nvidia-nvjitlink-cu12
|
| 4206 |
-
Downloading nvidia-curand-cu12
|
| 4207 |
-
Downloading nvidia-cuda-nvrtc-cu12
|
| 4208 |
-
Downloading triton
|
| 4209 |
-
Downloading nvidia-cufft-cu12
|
| 4210 |
-
Downloading nvidia-cusolver-cu12
|
| 4211 |
-
Downloading nvidia-cusparse-cu12
|
| 4212 |
-
Downloading nvidia-cusparselt-cu12
|
| 4213 |
-
Downloading nvidia-nccl-cu12
|
| 4214 |
-
Downloading nvidia-cublas-cu12
|
| 4215 |
-
Downloading nvidia-cudnn-cu12
|
| 4216 |
-
Downloading torch
|
| 4217 |
-
Installed 37 packages in 212ms
|
| 4218 |
-
</div>
|
| 4219 |
-
</div>
|
| 4220 |
<div class="cell-artifacts">
|
| 4221 |
<h4>Artifacts:</h4>
|
| 4222 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
|
|
|
| 3879 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3880 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3881 |
</span> |
|
| 3882 |
+
Cell: nv | 0.26s
|
| 3883 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3884 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3885 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3896 |
</div>
|
| 3897 |
</div>
|
| 3898 |
<div id="output-nv" class="cell-output">
|
| 3899 |
+
<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 15:50:02 2025
|
| 3900 |
+-----------------------------------------------------------------------------------------+
|
| 3901 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3902 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3905 |
| | | MIG M. |
|
| 3906 |
|=========================================+========================+======================|
|
| 3907 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3908 |
+
| N/A 29C P0 165W / 350W | 0MiB / 46068MiB | 61% Default |
|
| 3909 |
| | | N/A |
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
| 3911 |
|
|
|
|
| 3927 |
<span class="collapse-indicators">
|
| 3928 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3929 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3930 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3931 |
</span> |
|
| 3932 |
+
Cell: benchmark | 3.82s
|
| 3933 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3934 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3935 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3980 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3981 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3982 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3983 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.562ms 101.45% 3.562ms 3.562ms 1
|
| 3984 |
+
torch_flash_ma 6.38% 328.580us 45.84% 2.360ms 2.360ms 0.000us 0.00% 3.551ms 3.551ms 1
|
| 3985 |
+
aten::scaled_dot_product_attention 0.79% 40.571us 4.12% 212.315us 70.772us 0.000us 0.00% 2.798ms 932.779us 3
|
| 3986 |
+
aten::_scaled_dot_product_flash_attention 0.52% 26.642us 3.34% 171.744us 57.248us 0.000us 0.00% 2.798ms 932.779us 3
|
| 3987 |
+
aten::_flash_attention_forward 0.74% 37.939us 2.40% 123.383us 41.128us 2.798ms 79.71% 2.798ms 932.779us 3
|
| 3988 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.798ms 79.71% 2.798ms 932.779us 3
|
| 3989 |
+
aten::contiguous 0.27% 13.720us 34.12% 1.757ms 146.409us 0.000us 0.00% 752.288us 62.691us 12
|
| 3990 |
+
aten::clone 0.73% 37.449us 33.85% 1.743ms 145.266us 0.000us 0.00% 752.288us 62.691us 12
|
| 3991 |
+
aten::copy_ 1.68% 86.484us 31.57% 1.625ms 135.456us 712.095us 20.29% 752.288us 62.691us 12
|
| 3992 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 712.095us 20.29% 712.095us 59.341us 12
|
| 3993 |
+
Activity Buffer Request 28.00% 1.442ms 28.00% 1.442ms 1.442ms 40.193us 1.14% 40.193us 40.193us 1
|
| 3994 |
+
aten::transpose 1.22% 62.637us 1.64% 84.218us 3.509us 0.000us 0.00% 0.000us 0.000us 24
|
| 3995 |
+
aten::as_strided 0.42% 21.581us 0.42% 21.581us 0.899us 0.000us 0.00% 0.000us 0.000us 24
|
| 3996 |
+
aten::empty_like 0.48% 24.619us 1.97% 101.523us 6.768us 0.000us 0.00% 0.000us 0.000us 15
|
| 3997 |
+
aten::empty 1.76% 90.465us 1.76% 90.465us 3.769us 0.000us 0.00% 0.000us 0.000us 24
|
| 3998 |
+
cudaLaunchKernel 2.36% 121.521us 2.36% 121.521us 8.101us 0.000us 0.00% 0.000us 0.000us 15
|
| 3999 |
+
aten::empty_strided 0.31% 15.721us 0.31% 15.721us 5.240us 0.000us 0.00% 0.000us 0.000us 3
|
| 4000 |
+
cudaDeviceGetAttribute 0.04% 2.280us 0.04% 2.280us 0.380us 0.000us 0.00% 0.000us 0.000us 6
|
| 4001 |
+
cudaFuncSetAttribute 0.16% 8.181us 0.16% 8.181us 2.727us 0.000us 0.00% 0.000us 0.000us 3
|
| 4002 |
+
cudaDeviceSynchronize 54.16% 2.789ms 54.16% 2.789ms 2.789ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
+
Self CPU time total: 5.149ms
|
| 4005 |
+
Self CUDA time total: 3.510ms
|
| 4006 |
|
| 4007 |
|
| 4008 |
|
|
|
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4014 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4015 |
+
torch_flash_ma 4.71% 257.538us 44.52% 2.436ms 2.436ms 0.000us 0.00% 3.763ms 3.763ms 1
|
| 4016 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.719ms 100.29% 3.719ms 3.719ms 1
|
| 4017 |
+
aten::scaled_dot_product_attention 0.45% 24.440us 3.30% 180.683us 60.228us 0.000us 0.00% 2.948ms 982.525us 3
|
| 4018 |
+
aten::_scaled_dot_product_flash_attention 0.35% 18.890us 2.86% 156.243us 52.081us 0.000us 0.00% 2.948ms 982.525us 3
|
| 4019 |
+
aten::_flash_attention_forward 0.68% 37.218us 2.07% 113.133us 37.711us 2.948ms 79.49% 2.948ms 982.525us 3
|
| 4020 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.948ms 79.49% 2.948ms 982.525us 3
|
| 4021 |
+
aten::contiguous 0.16% 8.651us 35.72% 1.955ms 162.890us 0.000us 0.00% 815.678us 67.973us 12
|
| 4022 |
+
aten::clone 0.48% 26.452us 35.56% 1.946ms 162.169us 0.000us 0.00% 815.678us 67.973us 12
|
| 4023 |
+
aten::copy_ 1.81% 99.279us 33.97% 1.859ms 154.885us 760.479us 20.51% 815.678us 67.973us 12
|
| 4024 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 760.479us 20.51% 760.479us 63.373us 12
|
| 4025 |
+
Activity Buffer Request 30.60% 1.674ms 30.60% 1.674ms 1.674ms 55.199us 1.49% 55.199us 55.199us 1
|
| 4026 |
+
aten::transpose 0.92% 50.270us 1.23% 67.460us 2.811us 0.000us 0.00% 0.000us 0.000us 24
|
| 4027 |
+
aten::as_strided 0.31% 17.190us 0.31% 17.190us 0.716us 0.000us 0.00% 0.000us 0.000us 24
|
| 4028 |
+
aten::empty_like 0.34% 18.723us 1.45% 79.503us 5.300us 0.000us 0.00% 0.000us 0.000us 15
|
| 4029 |
+
aten::empty 1.39% 75.933us 1.39% 75.933us 3.164us 0.000us 0.00% 0.000us 0.000us 24
|
| 4030 |
+
cudaLaunchKernel 1.98% 108.143us 1.98% 108.143us 7.210us 0.000us 0.00% 0.000us 0.000us 15
|
| 4031 |
+
aten::empty_strided 0.25% 13.599us 0.25% 13.599us 4.533us 0.000us 0.00% 0.000us 0.000us 3
|
| 4032 |
+
cudaDeviceGetAttribute 0.03% 1.831us 0.03% 1.831us 0.305us 0.000us 0.00% 0.000us 0.000us 6
|
| 4033 |
+
cudaFuncSetAttribute 0.07% 3.690us 0.07% 3.690us 1.230us 0.000us 0.00% 0.000us 0.000us 3
|
| 4034 |
+
cudaDeviceSynchronize 55.48% 3.036ms 55.48% 3.036ms 3.036ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4035 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4036 |
+
Self CPU time total: 5.472ms
|
| 4037 |
+
Self CUDA time total: 3.708ms
|
| 4038 |
|
| 4039 |
|
| 4040 |
|
|
|
|
| 4044 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4045 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4046 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4047 |
+
torch_flash_ma 4.65% 248.558us 40.70% 2.176ms 2.176ms 0.000us 0.00% 3.868ms 3.868ms 1
|
| 4048 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.819ms 100.29% 3.819ms 3.819ms 1
|
| 4049 |
+
aten::scaled_dot_product_attention 0.45% 24.181us 3.36% 179.834us 59.945us 0.000us 0.00% 3.027ms 1.009ms 3
|
| 4050 |
+
aten::_scaled_dot_product_flash_attention 0.34% 18.100us 2.91% 155.653us 51.884us 0.000us 0.00% 3.027ms 1.009ms 3
|
| 4051 |
+
aten::_flash_attention_forward 0.73% 38.760us 2.16% 115.412us 38.471us 3.027ms 79.48% 3.027ms 1.009ms 3
|
| 4052 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.027ms 79.48% 3.027ms 1.009ms 3
|
| 4053 |
+
aten::contiguous 0.16% 8.609us 31.88% 1.704ms 142.018us 0.000us 0.00% 841.280us 70.107us 12
|
| 4054 |
+
aten::clone 0.50% 26.820us 31.72% 1.696ms 141.301us 0.000us 0.00% 841.280us 70.107us 12
|
| 4055 |
+
aten::copy_ 1.47% 78.703us 30.10% 1.609ms 134.076us 781.631us 20.52% 841.280us 70.107us 12
|
| 4056 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 781.631us 20.52% 781.631us 65.136us 12
|
| 4057 |
+
Activity Buffer Request 27.11% 1.449ms 27.11% 1.449ms 1.449ms 59.649us 1.57% 59.649us 59.649us 1
|
| 4058 |
+
aten::transpose 0.90% 48.151us 1.22% 65.102us 2.713us 0.000us 0.00% 0.000us 0.000us 24
|
| 4059 |
+
aten::as_strided 0.32% 16.951us 0.32% 16.951us 0.706us 0.000us 0.00% 0.000us 0.000us 24
|
| 4060 |
+
aten::empty_like 0.35% 18.789us 1.49% 79.862us 5.324us 0.000us 0.00% 0.000us 0.000us 15
|
| 4061 |
+
aten::empty 1.38% 73.892us 1.38% 73.892us 3.079us 0.000us 0.00% 0.000us 0.000us 24
|
| 4062 |
+
cudaLaunchKernel 1.96% 104.680us 1.96% 104.680us 6.979us 0.000us 0.00% 0.000us 0.000us 15
|
| 4063 |
+
aten::empty_strided 0.28% 15.081us 0.28% 15.081us 5.027us 0.000us 0.00% 0.000us 0.000us 3
|
| 4064 |
+
cudaDeviceGetAttribute 0.03% 1.791us 0.03% 1.791us 0.299us 0.000us 0.00% 0.000us 0.000us 6
|
| 4065 |
+
cudaFuncSetAttribute 0.07% 3.500us 0.07% 3.500us 1.167us 0.000us 0.00% 0.000us 0.000us 3
|
| 4066 |
+
cudaDeviceSynchronize 59.30% 3.169ms 59.30% 3.169ms 3.169ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4067 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4068 |
+
Self CPU time total: 5.345ms
|
| 4069 |
+
Self CUDA time total: 3.808ms
|
| 4070 |
|
| 4071 |
|
| 4072 |
|
|
|
|
| 4076 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4077 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4078 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4079 |
+
torch_flash_ma 4.50% 255.237us 42.25% 2.398ms 2.398ms 0.000us 0.00% 3.984ms 3.984ms 1
|
| 4080 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.936ms 100.28% 3.936ms 3.936ms 1
|
| 4081 |
+
aten::scaled_dot_product_attention 0.42% 23.840us 3.17% 179.904us 59.968us 0.000us 0.00% 3.135ms 1.045ms 3
|
| 4082 |
+
aten::_scaled_dot_product_flash_attention 0.36% 20.442us 2.75% 156.064us 52.021us 0.000us 0.00% 3.135ms 1.045ms 3
|
| 4083 |
+
aten::_flash_attention_forward 0.68% 38.721us 1.99% 113.183us 37.728us 3.135ms 79.87% 3.135ms 1.045ms 3
|
| 4084 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.135ms 79.87% 3.135ms 1.045ms 3
|
| 4085 |
+
aten::contiguous 0.17% 9.382us 33.81% 1.919ms 159.915us 0.000us 0.00% 848.416us 70.701us 12
|
| 4086 |
+
aten::clone 0.52% 29.639us 33.64% 1.910ms 159.133us 0.000us 0.00% 848.416us 70.701us 12
|
| 4087 |
+
aten::copy_ 1.40% 79.644us 32.03% 1.818ms 151.492us 790.048us 20.13% 848.416us 70.701us 12
|
| 4088 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 790.048us 20.13% 790.048us 65.837us 12
|
| 4089 |
+
Activity Buffer Request 25.14% 1.427ms 25.14% 1.427ms 1.427ms 58.368us 1.49% 58.368us 58.368us 1
|
| 4090 |
+
aten::transpose 0.87% 49.289us 1.17% 66.169us 2.757us 0.000us 0.00% 0.000us 0.000us 24
|
| 4091 |
+
aten::as_strided 0.30% 16.880us 0.30% 16.880us 0.703us 0.000us 0.00% 0.000us 0.000us 24
|
| 4092 |
+
aten::empty_like 0.35% 19.852us 1.42% 80.662us 5.377us 0.000us 0.00% 0.000us 0.000us 15
|
| 4093 |
+
aten::empty 1.32% 74.981us 1.32% 74.981us 3.124us 0.000us 0.00% 0.000us 0.000us 24
|
| 4094 |
+
cudaLaunchKernel 5.89% 334.125us 5.89% 334.125us 22.275us 0.000us 0.00% 0.000us 0.000us 15
|
| 4095 |
+
aten::empty_strided 0.24% 13.720us 0.24% 13.720us 4.573us 0.000us 0.00% 0.000us 0.000us 3
|
| 4096 |
+
cudaDeviceGetAttribute 0.03% 1.760us 0.03% 1.760us 0.293us 0.000us 0.00% 0.000us 0.000us 6
|
| 4097 |
+
cudaFuncSetAttribute 0.06% 3.570us 0.06% 3.570us 1.190us 0.000us 0.00% 0.000us 0.000us 3
|
| 4098 |
+
cudaDeviceSynchronize 57.75% 3.278ms 57.75% 3.278ms 3.278ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4099 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4100 |
+
Self CPU time total: 5.676ms
|
| 4101 |
+
Self CUDA time total: 3.925ms
|
| 4102 |
|
| 4103 |
|
| 4104 |
|
|
|
|
| 4108 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4109 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4110 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4111 |
+
torch_flash_ma 5.07% 311.056us 40.82% 2.505ms 2.505ms 0.000us 0.00% 4.409ms 4.409ms 1
|
| 4112 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.359ms 100.26% 4.359ms 4.359ms 1
|
| 4113 |
+
aten::scaled_dot_product_attention 0.41% 24.931us 3.07% 188.265us 62.755us 0.000us 0.00% 3.539ms 1.180ms 3
|
| 4114 |
+
aten::_scaled_dot_product_flash_attention 0.33% 20.199us 2.66% 163.334us 54.445us 0.000us 0.00% 3.539ms 1.180ms 3
|
| 4115 |
+
aten::_flash_attention_forward 0.67% 41.371us 1.94% 118.823us 39.608us 3.539ms 81.38% 3.539ms 1.180ms 3
|
| 4116 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.539ms 81.38% 3.539ms 1.180ms 3
|
| 4117 |
+
aten::contiguous 0.16% 9.771us 31.97% 1.962ms 163.526us 0.000us 0.00% 870.819us 72.568us 12
|
| 4118 |
+
aten::clone 0.47% 28.779us 31.82% 1.953ms 162.712us 0.000us 0.00% 870.819us 72.568us 12
|
| 4119 |
+
aten::copy_ 1.27% 77.896us 30.33% 1.862ms 155.132us 809.571us 18.62% 870.819us 72.568us 12
|
| 4120 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 809.571us 18.62% 809.571us 67.464us 12
|
| 4121 |
+
Activity Buffer Request 24.14% 1.481ms 24.14% 1.481ms 1.481ms 61.248us 1.41% 61.248us 61.248us 1
|
| 4122 |
+
aten::transpose 0.82% 50.583us 1.11% 68.092us 2.837us 0.000us 0.00% 0.000us 0.000us 24
|
| 4123 |
+
aten::as_strided 0.29% 17.509us 0.29% 17.509us 0.730us 0.000us 0.00% 0.000us 0.000us 24
|
| 4124 |
+
aten::empty_like 0.32% 19.913us 1.33% 81.883us 5.459us 0.000us 0.00% 0.000us 0.000us 15
|
| 4125 |
+
aten::empty 1.23% 75.660us 1.23% 75.660us 3.153us 0.000us 0.00% 0.000us 0.000us 24
|
| 4126 |
+
cudaLaunchKernel 5.31% 325.825us 5.31% 325.825us 21.722us 0.000us 0.00% 0.000us 0.000us 15
|
| 4127 |
+
aten::empty_strided 0.24% 14.770us 0.24% 14.770us 4.923us 0.000us 0.00% 0.000us 0.000us 3
|
| 4128 |
+
cudaDeviceGetAttribute 0.03% 1.990us 0.03% 1.990us 0.332us 0.000us 0.00% 0.000us 0.000us 6
|
| 4129 |
+
cudaFuncSetAttribute 0.06% 3.670us 0.06% 3.670us 1.223us 0.000us 0.00% 0.000us 0.000us 3
|
| 4130 |
+
cudaDeviceSynchronize 59.18% 3.632ms 59.18% 3.632ms 3.632ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4131 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4132 |
+
Self CPU time total: 6.137ms
|
| 4133 |
+
Self CUDA time total: 4.348ms
|
| 4134 |
|
| 4135 |
|
| 4136 |
|
|
|
|
| 4140 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4141 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4142 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4143 |
+
torch_flash_ma 4.13% 252.675us 38.98% 2.384ms 2.384ms 0.000us 0.00% 4.451ms 4.451ms 1
|
| 4144 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.400ms 100.24% 4.400ms 4.400ms 1
|
| 4145 |
+
aten::scaled_dot_product_attention 0.50% 30.480us 3.11% 190.334us 63.445us 0.000us 0.00% 3.566ms 1.189ms 3
|
| 4146 |
+
aten::_scaled_dot_product_flash_attention 0.31% 19.082us 2.61% 159.854us 53.285us 0.000us 0.00% 3.566ms 1.189ms 3
|
| 4147 |
+
aten::_flash_attention_forward 0.62% 38.112us 1.93% 118.053us 39.351us 3.566ms 81.24% 3.566ms 1.189ms 3
|
| 4148 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.566ms 81.24% 3.566ms 1.189ms 3
|
| 4149 |
+
aten::contiguous 0.16% 9.891us 31.02% 1.897ms 158.059us 0.000us 0.00% 884.831us 73.736us 12
|
| 4150 |
+
aten::clone 0.50% 30.290us 30.85% 1.887ms 157.234us 0.000us 0.00% 884.831us 73.736us 12
|
| 4151 |
+
aten::copy_ 1.28% 78.520us 29.35% 1.795ms 149.550us 823.711us 18.76% 884.831us 73.736us 12
|
| 4152 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 823.711us 18.76% 823.711us 68.643us 12
|
| 4153 |
+
Activity Buffer Request 23.29% 1.424ms 23.29% 1.424ms 1.424ms 61.120us 1.39% 61.120us 61.120us 1
|
| 4154 |
+
aten::transpose 0.81% 49.593us 1.09% 66.721us 2.780us 0.000us 0.00% 0.000us 0.000us 24
|
| 4155 |
+
aten::as_strided 0.28% 17.128us 0.28% 17.128us 0.714us 0.000us 0.00% 0.000us 0.000us 24
|
| 4156 |
+
aten::empty_like 0.33% 20.381us 1.35% 82.362us 5.491us 0.000us 0.00% 0.000us 0.000us 15
|
| 4157 |
+
aten::empty 1.23% 74.920us 1.23% 74.920us 3.122us 0.000us 0.00% 0.000us 0.000us 24
|
| 4158 |
+
cudaLaunchKernel 5.19% 317.558us 5.19% 317.558us 21.171us 0.000us 0.00% 0.000us 0.000us 15
|
| 4159 |
+
aten::empty_strided 0.25% 15.161us 0.25% 15.161us 5.054us 0.000us 0.00% 0.000us 0.000us 3
|
| 4160 |
+
cudaDeviceGetAttribute 0.03% 1.791us 0.03% 1.791us 0.299us 0.000us 0.00% 0.000us 0.000us 6
|
| 4161 |
+
cudaFuncSetAttribute 0.06% 3.670us 0.06% 3.670us 1.223us 0.000us 0.00% 0.000us 0.000us 3
|
| 4162 |
+
cudaDeviceSynchronize 61.02% 3.732ms 61.02% 3.732ms 3.732ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4163 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4164 |
+
Self CPU time total: 6.115ms
|
| 4165 |
+
Self CUDA time total: 4.390ms
|
| 4166 |
|
| 4167 |
|
| 4168 |
impl wl p50(ms) ok
|
| 4169 |
+
torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
|
| 4170 |
+
torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
|
| 4171 |
+
torch_flash_ma cuda_attn_L320_bfloat16 1.30 True
|
| 4172 |
+
torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
|
| 4173 |
+
torch_flash_ma cuda_attn_L448_bfloat16 1.48 True
|
| 4174 |
+
torch_flash_ma cuda_attn_L512_bfloat16 1.49 True
|
| 4175 |
</pre></div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4176 |
<div class="cell-artifacts">
|
| 4177 |
<h4>Artifacts:</h4>
|
| 4178 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
flash_attn/impls/hf_kernels_flash_attn.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
@@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: benchmark | 5.
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3926,21 +3934,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
|
|
| 3926 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3927 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3928 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3929 |
-
hf_kernels_flash_attn 3.
|
| 3930 |
-
_flash_attn_9e27194::fwd 1.
|
| 3931 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3932 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3933 |
-
Activity Buffer Request 32.
|
| 3934 |
-
cudaDeviceGetAttribute 0.
|
| 3935 |
-
aten::empty_like 0.
|
| 3936 |
-
aten::empty_strided 0.
|
| 3937 |
-
aten::empty 0.57%
|
| 3938 |
-
cudaFuncSetAttribute 0.
|
| 3939 |
-
cudaLaunchKernel
|
| 3940 |
-
cudaDeviceSynchronize 58.
|
| 3941 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3942 |
-
Self CPU time total: 4.
|
| 3943 |
-
Self CUDA time total: 2.
|
| 3944 |
|
| 3945 |
|
| 3946 |
|
|
@@ -3950,21 +3958,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
|
|
| 3950 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3951 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3952 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3953 |
-
hf_kernels_flash_attn
|
| 3954 |
-
_flash_attn_9e27194::fwd 1.
|
| 3955 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3956 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3957 |
-
Activity Buffer Request
|
| 3958 |
-
cudaDeviceGetAttribute 0.08% 3.
|
| 3959 |
-
aten::empty_like 0.
|
| 3960 |
-
aten::empty_strided 0.
|
| 3961 |
-
aten::empty 0.
|
| 3962 |
-
cudaFuncSetAttribute 0.08% 3.
|
| 3963 |
-
cudaLaunchKernel 0.
|
| 3964 |
-
cudaDeviceSynchronize
|
| 3965 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3966 |
-
Self CPU time total: 4.
|
| 3967 |
-
Self CUDA time total: 2.
|
| 3968 |
|
| 3969 |
|
| 3970 |
|
|
@@ -3974,21 +3982,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
|
|
| 3974 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3975 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3976 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3977 |
-
hf_kernels_flash_attn 2.
|
| 3978 |
-
_flash_attn_9e27194::fwd 1.
|
| 3979 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 3980 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 3981 |
-
Activity Buffer Request 31.
|
| 3982 |
-
cudaDeviceGetAttribute 0.08% 3.
|
| 3983 |
-
aten::empty_like 0.15%
|
| 3984 |
-
aten::empty_strided 0.
|
| 3985 |
-
aten::empty 0.
|
| 3986 |
-
cudaFuncSetAttribute 0.08% 3.
|
| 3987 |
-
cudaLaunchKernel 0.
|
| 3988 |
-
cudaDeviceSynchronize 63.
|
| 3989 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3990 |
-
Self CPU time total: 4.
|
| 3991 |
-
Self CUDA time total: 3.
|
| 3992 |
|
| 3993 |
|
| 3994 |
|
|
@@ -3998,21 +4006,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
|
|
| 3998 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3999 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4000 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4001 |
-
hf_kernels_flash_attn 2.
|
| 4002 |
-
_flash_attn_9e27194::fwd 1.
|
| 4003 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4004 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4005 |
-
Activity Buffer Request
|
| 4006 |
-
cudaDeviceGetAttribute 0.08% 3.
|
| 4007 |
-
aten::empty_like 0.
|
| 4008 |
-
aten::empty_strided 0.
|
| 4009 |
-
aten::empty 0.
|
| 4010 |
-
cudaFuncSetAttribute 0.08% 3.
|
| 4011 |
-
cudaLaunchKernel
|
| 4012 |
-
cudaDeviceSynchronize 61.
|
| 4013 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4014 |
-
Self CPU time total: 4.
|
| 4015 |
-
Self CUDA time total: 3.
|
| 4016 |
|
| 4017 |
|
| 4018 |
|
|
@@ -4022,21 +4030,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
|
|
| 4022 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4023 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4024 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4025 |
-
hf_kernels_flash_attn 2.
|
| 4026 |
-
_flash_attn_9e27194::fwd 0.
|
| 4027 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4028 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4029 |
-
Activity Buffer Request 27.53% 1.
|
| 4030 |
-
cudaDeviceGetAttribute 0.08% 4.
|
| 4031 |
-
aten::empty_like 0.
|
| 4032 |
-
aten::empty_strided 0.31% 16.
|
| 4033 |
-
aten::empty 0.
|
| 4034 |
-
cudaFuncSetAttribute 0.07% 3.
|
| 4035 |
-
cudaLaunchKernel 3.
|
| 4036 |
-
cudaDeviceSynchronize 65.
|
| 4037 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4038 |
-
Self CPU time total: 5.
|
| 4039 |
-
Self CUDA time total: 3.
|
| 4040 |
|
| 4041 |
|
| 4042 |
|
|
@@ -4046,35 +4054,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
|
|
| 4046 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4047 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
-
hf_kernels_flash_attn
|
| 4050 |
-
_flash_attn_9e27194::fwd
|
| 4051 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4052 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4053 |
-
Activity Buffer Request 26.
|
| 4054 |
-
cudaDeviceGetAttribute 0.
|
| 4055 |
-
aten::empty_like 0.
|
| 4056 |
-
aten::empty_strided 0.34% 18.
|
| 4057 |
-
aten::empty 0.
|
| 4058 |
-
cudaFuncSetAttribute 0.
|
| 4059 |
-
cudaLaunchKernel 3.
|
| 4060 |
-
cudaDeviceSynchronize 65.
|
| 4061 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4062 |
-
Self CPU time total: 5.
|
| 4063 |
-
Self CUDA time total: 3.
|
| 4064 |
|
| 4065 |
|
| 4066 |
impl wl p50(ms) ok
|
| 4067 |
-
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.
|
| 4068 |
-
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.
|
| 4069 |
-
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.
|
| 4070 |
-
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.
|
| 4071 |
-
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.
|
| 4072 |
-
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.
|
| 4073 |
</pre></div>
|
| 4074 |
<div class="cell-stderr">
|
| 4075 |
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 4076 |
-
Fetching 20 files:
|
| 4077 |
-
Fetching 20 files:
|
|
|
|
| 4078 |
</div>
|
| 4079 |
<div class="cell-artifacts">
|
| 4080 |
<h4>Artifacts:</h4>
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
|
|
|
| 3879 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3880 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3881 |
</span> |
|
| 3882 |
+
Cell: benchmark | 5.46s
|
| 3883 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3884 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3885 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3934 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3935 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3936 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3937 |
+
hf_kernels_flash_attn 3.61% 157.413us 41.18% 1.795ms 1.795ms 0.000us 0.00% 3.726ms 3.726ms 1
|
| 3938 |
+
_flash_attn_9e27194::fwd 1.61% 70.165us 37.57% 1.638ms 545.853us 2.781ms 100.00% 3.726ms 1.242ms 3
|
| 3939 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.783ms 100.05% 2.783ms 2.783ms 1
|
| 3940 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.781ms 100.00% 2.781ms 927.059us 3
|
| 3941 |
+
Activity Buffer Request 32.93% 1.435ms 32.93% 1.435ms 1.435ms 944.349us 33.96% 944.349us 944.349us 1
|
| 3942 |
+
cudaDeviceGetAttribute 0.11% 4.789us 0.11% 4.789us 0.319us 0.000us 0.00% 0.000us 0.000us 15
|
| 3943 |
+
aten::empty_like 0.38% 16.590us 1.18% 51.251us 17.084us 0.000us 0.00% 0.000us 0.000us 3
|
| 3944 |
+
aten::empty_strided 0.80% 34.661us 0.80% 34.661us 11.554us 0.000us 0.00% 0.000us 0.000us 3
|
| 3945 |
+
aten::empty 0.57% 24.950us 0.57% 24.950us 2.772us 0.000us 0.00% 0.000us 0.000us 9
|
| 3946 |
+
cudaFuncSetAttribute 0.27% 11.579us 0.27% 11.579us 3.860us 0.000us 0.00% 0.000us 0.000us 3
|
| 3947 |
+
cudaLaunchKernel 0.90% 39.431us 0.90% 39.431us 13.144us 0.000us 0.00% 0.000us 0.000us 3
|
| 3948 |
+
cudaDeviceSynchronize 58.82% 2.564ms 58.82% 2.564ms 2.564ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3949 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3950 |
+
Self CPU time total: 4.359ms
|
| 3951 |
+
Self CUDA time total: 2.781ms
|
| 3952 |
|
| 3953 |
|
| 3954 |
|
|
|
|
| 3958 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3959 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3960 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3961 |
+
hf_kernels_flash_attn 1.92% 86.861us 37.15% 1.685ms 1.685ms 0.000us 0.00% 3.967ms 3.967ms 1
|
| 3962 |
+
_flash_attn_9e27194::fwd 1.05% 47.633us 35.24% 1.598ms 532.729us 2.988ms 100.00% 3.967ms 1.322ms 3
|
| 3963 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.989ms 100.05% 2.989ms 2.989ms 1
|
| 3964 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.988ms 100.00% 2.988ms 995.953us 3
|
| 3965 |
+
Activity Buffer Request 32.54% 1.476ms 32.54% 1.476ms 1.476ms 979.196us 32.77% 979.196us 979.196us 1
|
| 3966 |
+
cudaDeviceGetAttribute 0.08% 3.549us 0.08% 3.549us 0.237us 0.000us 0.00% 0.000us 0.000us 15
|
| 3967 |
+
aten::empty_like 0.15% 6.770us 0.48% 21.750us 7.250us 0.000us 0.00% 0.000us 0.000us 3
|
| 3968 |
+
aten::empty_strided 0.33% 14.980us 0.33% 14.980us 4.993us 0.000us 0.00% 0.000us 0.000us 3
|
| 3969 |
+
aten::empty 0.45% 20.562us 0.45% 20.562us 2.285us 0.000us 0.00% 0.000us 0.000us 9
|
| 3970 |
+
cudaFuncSetAttribute 0.08% 3.410us 0.08% 3.410us 1.137us 0.000us 0.00% 0.000us 0.000us 3
|
| 3971 |
+
cudaLaunchKernel 0.56% 25.521us 0.56% 25.521us 8.507us 0.000us 0.00% 0.000us 0.000us 3
|
| 3972 |
+
cudaDeviceSynchronize 62.85% 2.850ms 62.85% 2.850ms 2.850ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3973 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3974 |
+
Self CPU time total: 4.535ms
|
| 3975 |
+
Self CUDA time total: 2.988ms
|
| 3976 |
|
| 3977 |
|
| 3978 |
|
|
|
|
| 3982 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3983 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3984 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3985 |
+
hf_kernels_flash_attn 2.25% 102.643us 36.16% 1.652ms 1.652ms 0.000us 0.00% 4.081ms 4.081ms 1
|
| 3986 |
+
_flash_attn_9e27194::fwd 1.10% 50.081us 33.92% 1.550ms 516.605us 3.056ms 100.00% 4.081ms 1.360ms 3
|
| 3987 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.058ms 100.05% 3.058ms 3.058ms 1
|
| 3988 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.056ms 100.00% 3.056ms 1.019ms 3
|
| 3989 |
+
Activity Buffer Request 31.13% 1.423ms 31.13% 1.423ms 1.423ms 1.024ms 33.52% 1.024ms 1.024ms 1
|
| 3990 |
+
cudaDeviceGetAttribute 0.08% 3.832us 0.08% 3.832us 0.255us 0.000us 0.00% 0.000us 0.000us 15
|
| 3991 |
+
aten::empty_like 0.15% 6.971us 0.48% 22.109us 7.370us 0.000us 0.00% 0.000us 0.000us 3
|
| 3992 |
+
aten::empty_strided 0.33% 15.138us 0.33% 15.138us 5.046us 0.000us 0.00% 0.000us 0.000us 3
|
| 3993 |
+
aten::empty 0.46% 20.860us 0.46% 20.860us 2.318us 0.000us 0.00% 0.000us 0.000us 9
|
| 3994 |
+
cudaFuncSetAttribute 0.08% 3.430us 0.08% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3
|
| 3995 |
+
cudaLaunchKernel 0.59% 26.891us 0.59% 26.891us 8.964us 0.000us 0.00% 0.000us 0.000us 3
|
| 3996 |
+
cudaDeviceSynchronize 63.84% 2.917ms 63.84% 2.917ms 2.917ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3997 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3998 |
+
Self CPU time total: 4.569ms
|
| 3999 |
+
Self CUDA time total: 3.056ms
|
| 4000 |
|
| 4001 |
|
| 4002 |
|
|
|
|
| 4006 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4007 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4008 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4009 |
+
hf_kernels_flash_attn 2.25% 106.084us 38.22% 1.803ms 1.803ms 0.000us 0.00% 4.091ms 4.091ms 1
|
| 4010 |
+
_flash_attn_9e27194::fwd 1.01% 47.791us 35.97% 1.697ms 565.799us 3.060ms 100.00% 4.091ms 1.364ms 3
|
| 4011 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.062ms 100.06% 3.062ms 3.062ms 1
|
| 4012 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.060ms 100.00% 3.060ms 1.020ms 3
|
| 4013 |
+
Activity Buffer Request 30.05% 1.418ms 30.05% 1.418ms 1.418ms 1.031ms 33.68% 1.031ms 1.031ms 1
|
| 4014 |
+
cudaDeviceGetAttribute 0.08% 3.720us 0.08% 3.720us 0.248us 0.000us 0.00% 0.000us 0.000us 15
|
| 4015 |
+
aten::empty_like 0.16% 7.600us 0.52% 24.620us 8.207us 0.000us 0.00% 0.000us 0.000us 3
|
| 4016 |
+
aten::empty_strided 0.36% 17.020us 0.36% 17.020us 5.673us 0.000us 0.00% 0.000us 0.000us 3
|
| 4017 |
+
aten::empty 0.44% 20.780us 0.44% 20.780us 2.309us 0.000us 0.00% 0.000us 0.000us 9
|
| 4018 |
+
cudaFuncSetAttribute 0.08% 3.620us 0.08% 3.620us 1.207us 0.000us 0.00% 0.000us 0.000us 3
|
| 4019 |
+
cudaLaunchKernel 3.79% 178.824us 3.79% 178.824us 59.608us 0.000us 0.00% 0.000us 0.000us 3
|
| 4020 |
+
cudaDeviceSynchronize 61.78% 2.916ms 61.78% 2.916ms 2.916ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
+
Self CPU time total: 4.719ms
|
| 4023 |
+
Self CUDA time total: 3.060ms
|
| 4024 |
|
| 4025 |
|
| 4026 |
|
|
|
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4032 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4033 |
+
hf_kernels_flash_attn 2.06% 106.072us 34.88% 1.800ms 1.800ms 0.000us 0.00% 4.679ms 4.679ms 1
|
| 4034 |
+
_flash_attn_9e27194::fwd 0.97% 50.192us 32.82% 1.694ms 564.573us 3.505ms 100.00% 4.679ms 1.560ms 3
|
| 4035 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.507ms 100.04% 3.507ms 3.507ms 1
|
| 4036 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.505ms 100.00% 3.505ms 1.168ms 3
|
| 4037 |
+
Activity Buffer Request 27.53% 1.421ms 27.53% 1.421ms 1.421ms 1.174ms 33.50% 1.174ms 1.174ms 1
|
| 4038 |
+
cudaDeviceGetAttribute 0.08% 4.219us 0.08% 4.219us 0.281us 0.000us 0.00% 0.000us 0.000us 15
|
| 4039 |
+
aten::empty_like 0.15% 7.700us 0.46% 23.940us 7.980us 0.000us 0.00% 0.000us 0.000us 3
|
| 4040 |
+
aten::empty_strided 0.31% 16.240us 0.31% 16.240us 5.413us 0.000us 0.00% 0.000us 0.000us 3
|
| 4041 |
+
aten::empty 0.41% 21.049us 0.41% 21.049us 2.339us 0.000us 0.00% 0.000us 0.000us 9
|
| 4042 |
+
cudaFuncSetAttribute 0.07% 3.601us 0.07% 3.601us 1.200us 0.000us 0.00% 0.000us 0.000us 3
|
| 4043 |
+
cudaLaunchKernel 3.29% 169.975us 3.29% 169.975us 56.658us 0.000us 0.00% 0.000us 0.000us 3
|
| 4044 |
+
cudaDeviceSynchronize 65.12% 3.360ms 65.12% 3.360ms 3.360ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4045 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4046 |
+
Self CPU time total: 5.160ms
|
| 4047 |
+
Self CUDA time total: 3.505ms
|
| 4048 |
|
| 4049 |
|
| 4050 |
|
|
|
|
| 4054 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4055 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4056 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4057 |
+
hf_kernels_flash_attn 2.05% 108.192us 34.34% 1.815ms 1.815ms 0.000us 0.00% 4.838ms 4.838ms 1
|
| 4058 |
+
_flash_attn_9e27194::fwd 0.96% 50.903us 32.30% 1.707ms 568.907us 3.618ms 100.00% 4.838ms 1.613ms 3
|
| 4059 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.620ms 100.04% 3.620ms 3.620ms 1
|
| 4060 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.618ms 100.00% 3.618ms 1.206ms 3
|
| 4061 |
+
Activity Buffer Request 26.73% 1.413ms 26.73% 1.413ms 1.413ms 1.220ms 33.72% 1.220ms 1.220ms 1
|
| 4062 |
+
cudaDeviceGetAttribute 0.07% 3.869us 0.07% 3.869us 0.258us 0.000us 0.00% 0.000us 0.000us 15
|
| 4063 |
+
aten::empty_like 0.14% 7.319us 0.48% 25.360us 8.453us 0.000us 0.00% 0.000us 0.000us 3
|
| 4064 |
+
aten::empty_strided 0.34% 18.041us 0.34% 18.041us 6.014us 0.000us 0.00% 0.000us 0.000us 3
|
| 4065 |
+
aten::empty 0.41% 21.680us 0.41% 21.680us 2.409us 0.000us 0.00% 0.000us 0.000us 9
|
| 4066 |
+
cudaFuncSetAttribute 0.07% 3.810us 0.07% 3.810us 1.270us 0.000us 0.00% 0.000us 0.000us 3
|
| 4067 |
+
cudaLaunchKernel 3.57% 188.496us 3.57% 188.496us 62.832us 0.000us 0.00% 0.000us 0.000us 3
|
| 4068 |
+
cudaDeviceSynchronize 65.66% 3.470ms 65.66% 3.470ms 3.470ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4069 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4070 |
+
Self CPU time total: 5.285ms
|
| 4071 |
+
Self CUDA time total: 3.618ms
|
| 4072 |
|
| 4073 |
|
| 4074 |
impl wl p50(ms) ok
|
| 4075 |
+
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
|
| 4076 |
+
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
|
| 4077 |
+
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
|
| 4078 |
+
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.07 True
|
| 4079 |
+
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
|
| 4080 |
+
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.19 True
|
| 4081 |
</pre></div>
|
| 4082 |
<div class="cell-stderr">
|
| 4083 |
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 4084 |
+
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:03, 6.07it/s]
|
| 4085 |
+
Fetching 20 files: 10%|█ | 2/20 [00:01<00:12, 1.40it/s]
|
| 4086 |
+
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 15.82it/s]
|
| 4087 |
</div>
|
| 4088 |
<div class="cell-artifacts">
|
| 4089 |
<h4>Artifacts:</h4>
|
flash_attn/impls/hf_kernels_flash_attn3.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
@@ -3869,9 +3877,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3869 |
<span class="collapse-indicators">
|
| 3870 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
-
<span id="uv-indicator-benchmark" style="cursor:
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: benchmark | 5.
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3925,19 +3933,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
|
|
| 3925 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3926 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3927 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3928 |
-
hf_kernels_flash_attn3 3.
|
| 3929 |
-
FlashAttnFunc 2.
|
| 3930 |
-
_flash_attn3_48fe103_dirty::fwd 1.
|
| 3931 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3932 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3933 |
-
Activity Buffer Request 33.
|
| 3934 |
-
aten::empty 1.
|
| 3935 |
-
cudaFuncSetAttribute 0.
|
| 3936 |
-
cudaLaunchKernel 1.
|
| 3937 |
-
cudaDeviceSynchronize
|
| 3938 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3939 |
-
Self CPU time total: 4.
|
| 3940 |
-
Self CUDA time total: 2.
|
| 3941 |
|
| 3942 |
|
| 3943 |
|
|
@@ -3947,19 +3955,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
|
|
| 3947 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3948 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3949 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3950 |
-
hf_kernels_flash_attn3 2.
|
| 3951 |
-
FlashAttnFunc 2.
|
| 3952 |
-
_flash_attn3_48fe103_dirty::fwd 1.
|
| 3953 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3954 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3955 |
-
Activity Buffer Request 33.
|
| 3956 |
-
aten::empty 0.
|
| 3957 |
-
cudaFuncSetAttribute 0.
|
| 3958 |
-
cudaLaunchKernel 0.
|
| 3959 |
-
cudaDeviceSynchronize
|
| 3960 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3961 |
-
Self CPU time total: 4.
|
| 3962 |
-
Self CUDA time total: 2.
|
| 3963 |
|
| 3964 |
|
| 3965 |
|
|
@@ -3969,19 +3977,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
|
|
| 3969 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3970 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3971 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3972 |
-
hf_kernels_flash_attn3 2.
|
| 3973 |
-
FlashAttnFunc
|
| 3974 |
-
_flash_attn3_48fe103_dirty::fwd 1.
|
| 3975 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3976 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3977 |
-
Activity Buffer Request
|
| 3978 |
-
aten::empty 0.
|
| 3979 |
-
cudaFuncSetAttribute 0.
|
| 3980 |
-
cudaLaunchKernel 0.69%
|
| 3981 |
-
cudaDeviceSynchronize 60.
|
| 3982 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3983 |
-
Self CPU time total: 4.
|
| 3984 |
-
Self CUDA time total: 2.
|
| 3985 |
|
| 3986 |
|
| 3987 |
|
|
@@ -3991,19 +3999,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
|
|
| 3991 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3992 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3993 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3994 |
-
hf_kernels_flash_attn3 2.
|
| 3995 |
-
FlashAttnFunc 1.
|
| 3996 |
-
_flash_attn3_48fe103_dirty::fwd
|
| 3997 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3998 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3999 |
-
Activity Buffer Request 30.
|
| 4000 |
-
aten::empty 0.
|
| 4001 |
-
cudaFuncSetAttribute 0.11% 5.
|
| 4002 |
-
cudaLaunchKernel
|
| 4003 |
-
cudaDeviceSynchronize 58.
|
| 4004 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4005 |
-
Self CPU time total: 4.
|
| 4006 |
-
Self CUDA time total:
|
| 4007 |
|
| 4008 |
|
| 4009 |
|
|
@@ -4013,19 +4021,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
|
|
| 4013 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4014 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4015 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4016 |
-
hf_kernels_flash_attn3 2.
|
| 4017 |
-
FlashAttnFunc 1.
|
| 4018 |
-
_flash_attn3_48fe103_dirty::fwd 0.
|
| 4019 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4020 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4021 |
-
Activity Buffer Request 28.
|
| 4022 |
-
aten::empty 0.
|
| 4023 |
-
cudaFuncSetAttribute 0.
|
| 4024 |
-
cudaLaunchKernel 3.
|
| 4025 |
-
cudaDeviceSynchronize 62.
|
| 4026 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4027 |
-
Self CPU time total: 5.
|
| 4028 |
-
Self CUDA time total: 3.
|
| 4029 |
|
| 4030 |
|
| 4031 |
|
|
@@ -4035,34 +4043,38 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
|
|
| 4035 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4036 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4037 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4038 |
-
hf_kernels_flash_attn3 2.
|
| 4039 |
-
FlashAttnFunc
|
| 4040 |
-
_flash_attn3_48fe103_dirty::fwd
|
| 4041 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4042 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4043 |
-
Activity Buffer Request
|
| 4044 |
-
aten::empty 0.54% 27.
|
| 4045 |
-
cudaFuncSetAttribute 0.10% 5.
|
| 4046 |
-
cudaLaunchKernel 3.
|
| 4047 |
-
cudaDeviceSynchronize
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
-
Self CPU time total: 5.
|
| 4050 |
-
Self CUDA time total: 3.
|
| 4051 |
|
| 4052 |
|
| 4053 |
impl wl p50(ms) ok
|
| 4054 |
-
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16
|
| 4055 |
-
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.
|
| 4056 |
-
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.
|
| 4057 |
-
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.
|
| 4058 |
-
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.
|
| 4059 |
-
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.
|
| 4060 |
</pre></div>
|
| 4061 |
-
<div class="
|
| 4062 |
-
|
| 4063 |
-
|
| 4064 |
-
|
|
|
|
| 4065 |
</div>
|
|
|
|
|
|
|
|
|
|
| 4066 |
<div class="cell-artifacts">
|
| 4067 |
<h4>Artifacts:</h4>
|
| 4068 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
|
|
|
| 3877 |
<span class="collapse-indicators">
|
| 3878 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3879 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3880 |
+
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3881 |
</span> |
|
| 3882 |
+
Cell: benchmark | 5.78s
|
| 3883 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3884 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3885 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3933 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3934 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3935 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3936 |
+
hf_kernels_flash_attn3 3.71% 164.893us 43.76% 1.944ms 1.944ms 0.000us 0.00% 3.688ms 3.688ms 1
|
| 3937 |
+
FlashAttnFunc 2.67% 118.403us 40.05% 1.779ms 593.141us 0.000us 0.00% 3.688ms 1.229ms 3
|
| 3938 |
+
_flash_attn3_48fe103_dirty::fwd 1.75% 77.922us 37.39% 1.661ms 553.673us 2.790ms 100.00% 3.688ms 1.229ms 3
|
| 3939 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.791ms 100.05% 2.791ms 2.791ms 1
|
| 3940 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.790ms 100.00% 2.790ms 929.856us 3
|
| 3941 |
+
Activity Buffer Request 33.30% 1.480ms 33.30% 1.480ms 1.480ms 898.016us 32.19% 898.016us 898.016us 1
|
| 3942 |
+
aten::empty 1.01% 44.942us 1.01% 44.942us 7.490us 0.000us 0.00% 0.000us 0.000us 6
|
| 3943 |
+
cudaFuncSetAttribute 0.31% 13.870us 0.31% 13.870us 4.623us 0.000us 0.00% 0.000us 0.000us 3
|
| 3944 |
+
cudaLaunchKernel 1.01% 44.741us 1.01% 44.741us 14.914us 0.000us 0.00% 0.000us 0.000us 3
|
| 3945 |
+
cudaDeviceSynchronize 56.24% 2.499ms 56.24% 2.499ms 2.499ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3946 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3947 |
+
Self CPU time total: 4.443ms
|
| 3948 |
+
Self CUDA time total: 2.790ms
|
| 3949 |
|
| 3950 |
|
| 3951 |
|
|
|
|
| 3955 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3956 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3957 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3958 |
+
hf_kernels_flash_attn3 2.31% 100.671us 40.75% 1.773ms 1.773ms 0.000us 0.00% 3.735ms 3.735ms 1
|
| 3959 |
+
FlashAttnFunc 2.09% 91.144us 38.44% 1.673ms 557.547us 0.000us 0.00% 3.735ms 1.245ms 3
|
| 3960 |
+
_flash_attn3_48fe103_dirty::fwd 1.16% 50.371us 36.34% 1.581ms 527.165us 2.796ms 100.00% 3.735ms 1.245ms 3
|
| 3961 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.798ms 100.06% 2.798ms 2.798ms 1
|
| 3962 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.796ms 100.00% 2.796ms 932.000us 3
|
| 3963 |
+
Activity Buffer Request 33.75% 1.469ms 33.75% 1.469ms 1.469ms 939.487us 33.60% 939.487us 939.487us 1
|
| 3964 |
+
aten::empty 0.64% 27.720us 0.64% 27.720us 4.620us 0.000us 0.00% 0.000us 0.000us 6
|
| 3965 |
+
cudaFuncSetAttribute 0.11% 4.991us 0.11% 4.991us 1.664us 0.000us 0.00% 0.000us 0.000us 3
|
| 3966 |
+
cudaLaunchKernel 0.68% 29.510us 0.68% 29.510us 9.837us 0.000us 0.00% 0.000us 0.000us 3
|
| 3967 |
+
cudaDeviceSynchronize 59.25% 2.578ms 59.25% 2.578ms 2.578ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3968 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3969 |
+
Self CPU time total: 4.352ms
|
| 3970 |
+
Self CUDA time total: 2.796ms
|
| 3971 |
|
| 3972 |
|
| 3973 |
|
|
|
|
| 3977 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3978 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3979 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3980 |
+
hf_kernels_flash_attn3 2.10% 95.451us 39.98% 1.817ms 1.817ms 0.000us 0.00% 3.967ms 3.967ms 1
|
| 3981 |
+
FlashAttnFunc 2.52% 114.605us 37.88% 1.721ms 573.824us 0.000us 0.00% 3.967ms 1.322ms 3
|
| 3982 |
+
_flash_attn3_48fe103_dirty::fwd 1.12% 50.981us 35.36% 1.607ms 535.622us 2.964ms 100.00% 3.967ms 1.322ms 3
|
| 3983 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.966ms 100.05% 2.966ms 2.966ms 1
|
| 3984 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.964ms 100.00% 2.964ms 988.118us 3
|
| 3985 |
+
Activity Buffer Request 32.83% 1.492ms 32.83% 1.492ms 1.492ms 1.002ms 33.81% 1.002ms 1.002ms 1
|
| 3986 |
+
aten::empty 0.60% 27.089us 0.60% 27.089us 4.515us 0.000us 0.00% 0.000us 0.000us 6
|
| 3987 |
+
cudaFuncSetAttribute 0.12% 5.480us 0.12% 5.480us 1.827us 0.000us 0.00% 0.000us 0.000us 3
|
| 3988 |
+
cudaLaunchKernel 0.69% 31.551us 0.69% 31.551us 10.517us 0.000us 0.00% 0.000us 0.000us 3
|
| 3989 |
+
cudaDeviceSynchronize 60.02% 2.727ms 60.02% 2.727ms 2.727ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3990 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3991 |
+
Self CPU time total: 4.544ms
|
| 3992 |
+
Self CUDA time total: 2.964ms
|
| 3993 |
|
| 3994 |
|
| 3995 |
|
|
|
|
| 3999 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4000 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4001 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4002 |
+
hf_kernels_flash_attn3 2.35% 113.792us 41.57% 2.016ms 2.016ms 0.000us 0.00% 4.078ms 4.078ms 1
|
| 4003 |
+
FlashAttnFunc 1.91% 92.684us 39.22% 1.902ms 634.112us 0.000us 0.00% 4.078ms 1.359ms 3
|
| 4004 |
+
_flash_attn3_48fe103_dirty::fwd 0.98% 47.600us 37.31% 1.810ms 603.217us 3.050ms 100.00% 4.078ms 1.359ms 3
|
| 4005 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.052ms 100.05% 3.052ms 3.052ms 1
|
| 4006 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.050ms 100.00% 3.050ms 1.017ms 3
|
| 4007 |
+
Activity Buffer Request 30.19% 1.464ms 30.19% 1.464ms 1.464ms 1.028ms 33.70% 1.028ms 1.028ms 1
|
| 4008 |
+
aten::empty 0.58% 28.221us 0.58% 28.221us 4.703us 0.000us 0.00% 0.000us 0.000us 6
|
| 4009 |
+
cudaFuncSetAttribute 0.11% 5.430us 0.11% 5.430us 1.810us 0.000us 0.00% 0.000us 0.000us 3
|
| 4010 |
+
cudaLaunchKernel 5.44% 264.046us 5.44% 264.046us 88.015us 0.000us 0.00% 0.000us 0.000us 3
|
| 4011 |
+
cudaDeviceSynchronize 58.43% 2.834ms 58.43% 2.834ms 2.834ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
+
Self CPU time total: 4.851ms
|
| 4014 |
+
Self CUDA time total: 3.050ms
|
| 4015 |
|
| 4016 |
|
| 4017 |
|
|
|
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4023 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4024 |
+
hf_kernels_flash_attn3 2.29% 116.152us 37.60% 1.908ms 1.908ms 0.000us 0.00% 4.514ms 4.514ms 1
|
| 4025 |
+
FlashAttnFunc 1.78% 90.384us 35.31% 1.792ms 597.414us 0.000us 0.00% 4.514ms 1.505ms 3
|
| 4026 |
+
_flash_attn3_48fe103_dirty::fwd 0.91% 46.231us 33.53% 1.702ms 567.286us 3.379ms 100.00% 4.514ms 1.505ms 3
|
| 4027 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.380ms 100.05% 3.380ms 3.380ms 1
|
| 4028 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.379ms 100.00% 3.379ms 1.126ms 3
|
| 4029 |
+
Activity Buffer Request 28.41% 1.442ms 28.41% 1.442ms 1.442ms 1.136ms 33.61% 1.136ms 1.136ms 1
|
| 4030 |
+
aten::empty 0.54% 27.250us 0.54% 27.250us 4.542us 0.000us 0.00% 0.000us 0.000us 6
|
| 4031 |
+
cudaFuncSetAttribute 0.10% 5.250us 0.10% 5.250us 1.750us 0.000us 0.00% 0.000us 0.000us 3
|
| 4032 |
+
cudaLaunchKernel 3.57% 181.204us 3.57% 181.204us 60.401us 0.000us 0.00% 0.000us 0.000us 3
|
| 4033 |
+
cudaDeviceSynchronize 62.40% 3.167ms 62.40% 3.167ms 3.167ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4034 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4035 |
+
Self CPU time total: 5.075ms
|
| 4036 |
+
Self CUDA time total: 3.379ms
|
| 4037 |
|
| 4038 |
|
| 4039 |
|
|
|
|
| 4043 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4044 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4045 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4046 |
+
hf_kernels_flash_attn3 2.24% 115.243us 39.36% 2.021ms 2.021ms 0.000us 0.00% 4.438ms 4.438ms 1
|
| 4047 |
+
FlashAttnFunc 1.78% 91.262us 37.12% 1.906ms 635.278us 0.000us 0.00% 4.438ms 1.479ms 3
|
| 4048 |
+
_flash_attn3_48fe103_dirty::fwd 0.90% 46.212us 35.34% 1.815ms 604.857us 3.325ms 100.00% 4.438ms 1.479ms 3
|
| 4049 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.327ms 100.04% 3.327ms 3.327ms 1
|
| 4050 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.325ms 100.00% 3.325ms 1.108ms 3
|
| 4051 |
+
Activity Buffer Request 30.40% 1.561ms 30.40% 1.561ms 1.561ms 1.113ms 33.46% 1.113ms 1.113ms 1
|
| 4052 |
+
aten::empty 0.54% 27.780us 0.54% 27.780us 4.630us 0.000us 0.00% 0.000us 0.000us 6
|
| 4053 |
+
cudaFuncSetAttribute 0.10% 5.330us 0.10% 5.330us 1.777us 0.000us 0.00% 0.000us 0.000us 3
|
| 4054 |
+
cudaLaunchKernel 3.40% 174.454us 3.40% 174.454us 58.151us 0.000us 0.00% 0.000us 0.000us 3
|
| 4055 |
+
cudaDeviceSynchronize 60.64% 3.113ms 60.64% 3.113ms 3.113ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4056 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4057 |
+
Self CPU time total: 5.134ms
|
| 4058 |
+
Self CUDA time total: 3.325ms
|
| 4059 |
|
| 4060 |
|
| 4061 |
impl wl p50(ms) ok
|
| 4062 |
+
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 1.00 True
|
| 4063 |
+
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.99 True
|
| 4064 |
+
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True
|
| 4065 |
+
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
|
| 4066 |
+
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.19 True
|
| 4067 |
+
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.15 True
|
| 4068 |
</pre></div>
|
| 4069 |
+
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4070 |
+
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4071 |
+
<div class="uv-logs-content" style="display: none;">
|
| 4072 |
+
Installed 15 packages in 13ms
|
| 4073 |
+
</div>
|
| 4074 |
</div>
|
| 4075 |
+
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4076 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.44it/s]
|
| 4077 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.88it/s]</div>
|
| 4078 |
<div class="cell-artifacts">
|
| 4079 |
<h4>Artifacts:</h4>
|
| 4080 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
flash_attn/impls/mem_efficient_attention.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
@@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: benchmark | 3.
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3924,28 +3932,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
|
|
| 3924 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3925 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3926 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3927 |
-
torch_mem_eff
|
| 3928 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 3929 |
-
aten::scaled_dot_product_attention 0.
|
| 3930 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 3931 |
-
aten::_efficient_attention_forward 0.
|
| 3932 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3933 |
-
aten::contiguous 0.
|
| 3934 |
-
aten::clone 0.
|
| 3935 |
-
aten::copy_ 1.
|
| 3936 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3937 |
-
Activity Buffer Request 20.
|
| 3938 |
-
aten::transpose 0.
|
| 3939 |
-
aten::as_strided 0.
|
| 3940 |
-
aten::empty_like 0.
|
| 3941 |
-
aten::empty 1.
|
| 3942 |
-
cudaLaunchKernel 1.
|
| 3943 |
-
cudaStreamIsCapturing 0.
|
| 3944 |
-
cudaFuncSetAttribute 0.
|
| 3945 |
-
cudaDeviceSynchronize
|
| 3946 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3947 |
-
Self CPU time total:
|
| 3948 |
-
Self CUDA time total: 5.
|
| 3949 |
|
| 3950 |
|
| 3951 |
|
|
@@ -3955,28 +3963,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
|
|
| 3955 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3956 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3957 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3958 |
-
torch_mem_eff 3.
|
| 3959 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 3960 |
-
aten::scaled_dot_product_attention 0.
|
| 3961 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 3962 |
-
aten::_efficient_attention_forward 0.
|
| 3963 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3964 |
-
aten::contiguous 0.10% 7.
|
| 3965 |
-
aten::clone 0.
|
| 3966 |
-
aten::copy_ 0.
|
| 3967 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3968 |
-
Activity Buffer Request
|
| 3969 |
-
aten::transpose 0.
|
| 3970 |
-
aten::as_strided 0.
|
| 3971 |
-
aten::empty_like 0.
|
| 3972 |
-
aten::empty 0.
|
| 3973 |
-
cudaLaunchKernel 1.
|
| 3974 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 3975 |
-
cudaFuncSetAttribute 0.
|
| 3976 |
-
cudaDeviceSynchronize
|
| 3977 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3978 |
-
Self CPU time total: 7.
|
| 3979 |
-
Self CUDA time total: 5.
|
| 3980 |
|
| 3981 |
|
| 3982 |
|
|
@@ -3986,28 +3994,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
|
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3988 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3989 |
-
torch_mem_eff 3.
|
| 3990 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 3991 |
-
aten::scaled_dot_product_attention 0.
|
| 3992 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 3993 |
-
aten::_efficient_attention_forward 0.
|
| 3994 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 3995 |
-
aten::contiguous 0.
|
| 3996 |
-
aten::clone 0.
|
| 3997 |
-
aten::copy_
|
| 3998 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3999 |
-
Activity Buffer Request
|
| 4000 |
-
aten::transpose 0.
|
| 4001 |
-
aten::as_strided 0.
|
| 4002 |
-
aten::empty_like 0.
|
| 4003 |
-
aten::empty 0.
|
| 4004 |
-
cudaLaunchKernel 1.
|
| 4005 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4006 |
-
cudaFuncSetAttribute 0.
|
| 4007 |
-
cudaDeviceSynchronize
|
| 4008 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4009 |
-
Self CPU time total: 7.
|
| 4010 |
-
Self CUDA time total: 5.
|
| 4011 |
|
| 4012 |
|
| 4013 |
|
|
@@ -4017,28 +4025,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
|
|
| 4017 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4018 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4019 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4020 |
-
torch_mem_eff 3.
|
| 4021 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4022 |
-
aten::scaled_dot_product_attention 0.24% 18.
|
| 4023 |
-
aten::_scaled_dot_product_efficient_attention 0.25% 19.
|
| 4024 |
-
aten::_efficient_attention_forward 0.
|
| 4025 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4026 |
-
aten::contiguous 0.10% 7.
|
| 4027 |
-
aten::clone 0.28% 21.
|
| 4028 |
-
aten::copy_ 0.
|
| 4029 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4030 |
-
Activity Buffer Request 18.
|
| 4031 |
-
aten::transpose 0.
|
| 4032 |
-
aten::as_strided 0.
|
| 4033 |
-
aten::empty_like 0.15%
|
| 4034 |
-
aten::empty 0.
|
| 4035 |
-
cudaLaunchKernel 3.
|
| 4036 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4037 |
-
cudaFuncSetAttribute 0.04% 3.
|
| 4038 |
-
cudaDeviceSynchronize 70.
|
| 4039 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4040 |
-
Self CPU time total: 7.
|
| 4041 |
-
Self CUDA time total: 6.
|
| 4042 |
|
| 4043 |
|
| 4044 |
|
|
@@ -4048,28 +4056,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
|
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4050 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4051 |
-
torch_mem_eff 3.
|
| 4052 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4053 |
-
aten::scaled_dot_product_attention 0.
|
| 4054 |
-
aten::_scaled_dot_product_efficient_attention 0.24% 19.
|
| 4055 |
-
aten::_efficient_attention_forward 0.36% 28.
|
| 4056 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4057 |
-
aten::contiguous 0.09%
|
| 4058 |
-
aten::clone 0.
|
| 4059 |
-
aten::copy_ 0.
|
| 4060 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4061 |
-
Activity Buffer Request 18.
|
| 4062 |
-
aten::transpose 0.
|
| 4063 |
-
aten::as_strided 0.
|
| 4064 |
-
aten::empty_like 0.15% 12.
|
| 4065 |
-
aten::empty 0.
|
| 4066 |
-
cudaLaunchKernel 3.
|
| 4067 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4068 |
-
cudaFuncSetAttribute 0.04% 3.
|
| 4069 |
-
cudaDeviceSynchronize 71.
|
| 4070 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4071 |
-
Self CPU time total:
|
| 4072 |
-
Self CUDA time total: 6.
|
| 4073 |
|
| 4074 |
|
| 4075 |
|
|
@@ -4079,36 +4087,36 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
|
|
| 4079 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4080 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4081 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4082 |
-
torch_mem_eff
|
| 4083 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4084 |
-
aten::scaled_dot_product_attention 0.
|
| 4085 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 4086 |
-
aten::_efficient_attention_forward 0.
|
| 4087 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4088 |
-
aten::contiguous 0.
|
| 4089 |
-
aten::clone 0.
|
| 4090 |
-
aten::copy_ 0.
|
| 4091 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4092 |
-
Activity Buffer Request 17.
|
| 4093 |
-
aten::transpose 0.
|
| 4094 |
-
aten::as_strided 0.
|
| 4095 |
-
aten::empty_like 0.14% 11.
|
| 4096 |
-
aten::empty 0.
|
| 4097 |
-
cudaLaunchKernel
|
| 4098 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4099 |
-
cudaFuncSetAttribute 0.04%
|
| 4100 |
-
cudaDeviceSynchronize 73.
|
| 4101 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4102 |
-
Self CPU time total: 8.
|
| 4103 |
-
Self CUDA time total: 6.
|
| 4104 |
|
| 4105 |
|
| 4106 |
impl wl p50(ms) ok
|
| 4107 |
torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
|
| 4108 |
-
torch_mem_eff cuda_attn_L256_bfloat16 1.
|
| 4109 |
-
torch_mem_eff cuda_attn_L320_bfloat16
|
| 4110 |
-
torch_mem_eff cuda_attn_L384_bfloat16
|
| 4111 |
-
torch_mem_eff cuda_attn_L448_bfloat16 2.
|
| 4112 |
torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
|
| 4113 |
</pre></div>
|
| 4114 |
<div class="cell-artifacts">
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
|
|
|
| 3879 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3880 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3881 |
</span> |
|
| 3882 |
+
Cell: benchmark | 3.89s
|
| 3883 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3884 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3885 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3932 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3933 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3934 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3935 |
+
torch_mem_eff 5.04% 355.427us 33.26% 2.347ms 2.347ms 0.000us 0.00% 5.443ms 5.443ms 1
|
| 3936 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.441ms 100.90% 5.441ms 5.441ms 1
|
| 3937 |
+
aten::scaled_dot_product_attention 0.45% 31.972us 2.63% 185.885us 61.962us 0.000us 0.00% 4.772ms 1.591ms 3
|
| 3938 |
+
aten::_scaled_dot_product_efficient_attention 0.35% 24.621us 2.18% 153.913us 51.304us 0.000us 0.00% 4.772ms 1.591ms 3
|
| 3939 |
+
aten::_efficient_attention_forward 0.53% 37.509us 1.49% 105.321us 35.107us 4.772ms 88.48% 4.772ms 1.591ms 3
|
| 3940 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.772ms 88.48% 4.772ms 1.591ms 3
|
| 3941 |
+
aten::contiguous 0.16% 11.612us 24.73% 1.745ms 193.873us 0.000us 0.00% 671.455us 74.606us 9
|
| 3942 |
+
aten::clone 0.45% 31.980us 24.56% 1.733ms 192.583us 0.000us 0.00% 671.455us 74.606us 9
|
| 3943 |
+
aten::copy_ 1.09% 76.971us 23.11% 1.631ms 181.191us 621.119us 11.52% 671.455us 74.606us 9
|
| 3944 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 621.119us 11.52% 621.119us 69.013us 9
|
| 3945 |
+
Activity Buffer Request 20.82% 1.469ms 20.82% 1.469ms 1.469ms 50.336us 0.93% 50.336us 50.336us 1
|
| 3946 |
+
aten::transpose 0.89% 62.923us 1.20% 84.503us 3.521us 0.000us 0.00% 0.000us 0.000us 24
|
| 3947 |
+
aten::as_strided 0.31% 21.580us 0.31% 21.580us 0.899us 0.000us 0.00% 0.000us 0.000us 24
|
| 3948 |
+
aten::empty_like 0.23% 16.040us 1.00% 70.551us 7.839us 0.000us 0.00% 0.000us 0.000us 9
|
| 3949 |
+
aten::empty 1.20% 84.702us 1.20% 84.702us 4.033us 0.000us 0.00% 0.000us 0.000us 21
|
| 3950 |
+
cudaLaunchKernel 1.56% 109.883us 1.56% 109.883us 9.157us 0.000us 0.00% 0.000us 0.000us 12
|
| 3951 |
+
cudaStreamIsCapturing 0.04% 3.130us 0.04% 3.130us 1.043us 0.000us 0.00% 0.000us 0.000us 3
|
| 3952 |
+
cudaFuncSetAttribute 0.13% 9.350us 0.13% 9.350us 3.117us 0.000us 0.00% 0.000us 0.000us 3
|
| 3953 |
+
cudaDeviceSynchronize 66.74% 4.709ms 66.74% 4.709ms 4.709ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3954 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3955 |
+
Self CPU time total: 7.056ms
|
| 3956 |
+
Self CUDA time total: 5.393ms
|
| 3957 |
|
| 3958 |
|
| 3959 |
|
|
|
|
| 3963 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3964 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3965 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3966 |
+
torch_mem_eff 3.16% 230.972us 28.28% 2.069ms 2.069ms 0.000us 0.00% 5.837ms 5.837ms 1
|
| 3967 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.791ms 100.14% 5.791ms 5.791ms 1
|
| 3968 |
+
aten::scaled_dot_product_attention 0.28% 20.721us 1.89% 138.014us 46.005us 0.000us 0.00% 5.147ms 1.716ms 3
|
| 3969 |
+
aten::_scaled_dot_product_efficient_attention 0.25% 18.299us 1.60% 117.293us 39.098us 0.000us 0.00% 5.147ms 1.716ms 3
|
| 3970 |
+
aten::_efficient_attention_forward 0.37% 27.244us 1.07% 78.053us 26.018us 5.147ms 89.00% 5.147ms 1.716ms 3
|
| 3971 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.147ms 89.00% 5.147ms 1.716ms 3
|
| 3972 |
+
aten::contiguous 0.10% 7.473us 22.69% 1.660ms 184.464us 0.000us 0.00% 690.528us 76.725us 9
|
| 3973 |
+
aten::clone 0.31% 22.407us 22.59% 1.653ms 183.634us 0.000us 0.00% 690.528us 76.725us 9
|
| 3974 |
+
aten::copy_ 0.90% 65.683us 21.62% 1.582ms 175.735us 636.032us 11.00% 690.528us 76.725us 9
|
| 3975 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.032us 11.00% 636.032us 70.670us 9
|
| 3976 |
+
Activity Buffer Request 19.82% 1.450ms 19.82% 1.450ms 1.450ms 54.496us 0.94% 54.496us 54.496us 1
|
| 3977 |
+
aten::transpose 0.62% 45.174us 0.83% 60.723us 2.530us 0.000us 0.00% 0.000us 0.000us 24
|
| 3978 |
+
aten::as_strided 0.21% 15.549us 0.21% 15.549us 0.648us 0.000us 0.00% 0.000us 0.000us 24
|
| 3979 |
+
aten::empty_like 0.16% 11.973us 0.67% 48.683us 5.409us 0.000us 0.00% 0.000us 0.000us 9
|
| 3980 |
+
aten::empty 0.84% 61.270us 0.84% 61.270us 2.918us 0.000us 0.00% 0.000us 0.000us 21
|
| 3981 |
+
cudaLaunchKernel 1.18% 86.180us 1.18% 86.180us 7.182us 0.000us 0.00% 0.000us 0.000us 12
|
| 3982 |
+
cudaStreamIsCapturing 0.03% 2.460us 0.03% 2.460us 0.820us 0.000us 0.00% 0.000us 0.000us 3
|
| 3983 |
+
cudaFuncSetAttribute 0.04% 3.159us 0.04% 3.159us 1.053us 0.000us 0.00% 0.000us 0.000us 3
|
| 3984 |
+
cudaDeviceSynchronize 71.72% 5.248ms 71.72% 5.248ms 5.248ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3985 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3986 |
+
Self CPU time total: 7.317ms
|
| 3987 |
+
Self CUDA time total: 5.783ms
|
| 3988 |
|
| 3989 |
|
| 3990 |
|
|
|
|
| 3994 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3995 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3996 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3997 |
+
torch_mem_eff 3.27% 244.917us 27.45% 2.054ms 2.054ms 0.000us 0.00% 6.034ms 6.034ms 1
|
| 3998 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.984ms 100.14% 5.984ms 5.984ms 1
|
| 3999 |
+
aten::scaled_dot_product_attention 0.26% 19.270us 1.91% 142.603us 47.534us 0.000us 0.00% 5.315ms 1.772ms 3
|
| 4000 |
+
aten::_scaled_dot_product_efficient_attention 0.25% 18.622us 1.65% 123.333us 41.111us 0.000us 0.00% 5.315ms 1.772ms 3
|
| 4001 |
+
aten::_efficient_attention_forward 0.37% 27.710us 1.08% 80.560us 26.853us 5.315ms 88.95% 5.315ms 1.772ms 3
|
| 4002 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.315ms 88.95% 5.315ms 1.772ms 3
|
| 4003 |
+
aten::contiguous 0.10% 7.220us 21.76% 1.628ms 180.911us 0.000us 0.00% 718.878us 79.875us 9
|
| 4004 |
+
aten::clone 0.29% 21.638us 21.66% 1.621ms 180.109us 0.000us 0.00% 718.878us 79.875us 9
|
| 4005 |
+
aten::copy_ 0.91% 68.381us 20.73% 1.551ms 172.378us 660.254us 11.05% 718.878us 79.875us 9
|
| 4006 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 660.254us 11.05% 660.254us 73.362us 9
|
| 4007 |
+
Activity Buffer Request 18.95% 1.418ms 18.95% 1.418ms 1.418ms 58.624us 0.98% 58.624us 58.624us 1
|
| 4008 |
+
aten::transpose 0.63% 46.916us 0.84% 62.771us 2.615us 0.000us 0.00% 0.000us 0.000us 24
|
| 4009 |
+
aten::as_strided 0.21% 15.855us 0.21% 15.855us 0.661us 0.000us 0.00% 0.000us 0.000us 24
|
| 4010 |
+
aten::empty_like 0.15% 11.482us 0.64% 47.942us 5.327us 0.000us 0.00% 0.000us 0.000us 9
|
| 4011 |
+
aten::empty 0.82% 61.110us 0.82% 61.110us 2.910us 0.000us 0.00% 0.000us 0.000us 21
|
| 4012 |
+
cudaLaunchKernel 1.17% 87.854us 1.17% 87.854us 7.321us 0.000us 0.00% 0.000us 0.000us 12
|
| 4013 |
+
cudaStreamIsCapturing 0.03% 2.410us 0.03% 2.410us 0.803us 0.000us 0.00% 0.000us 0.000us 3
|
| 4014 |
+
cudaFuncSetAttribute 0.04% 2.950us 0.04% 2.950us 0.983us 0.000us 0.00% 0.000us 0.000us 3
|
| 4015 |
+
cudaDeviceSynchronize 72.55% 5.429ms 72.55% 5.429ms 5.429ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4016 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4017 |
+
Self CPU time total: 7.483ms
|
| 4018 |
+
Self CUDA time total: 5.976ms
|
| 4019 |
|
| 4020 |
|
| 4021 |
|
|
|
|
| 4025 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4026 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4027 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4028 |
+
torch_mem_eff 3.13% 245.154us 29.09% 2.280ms 2.280ms 0.000us 0.00% 6.166ms 6.166ms 1
|
| 4029 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.117ms 100.15% 6.117ms 6.117ms 1
|
| 4030 |
+
aten::scaled_dot_product_attention 0.24% 18.991us 1.80% 140.753us 46.918us 0.000us 0.00% 5.454ms 1.818ms 3
|
| 4031 |
+
aten::_scaled_dot_product_efficient_attention 0.25% 19.741us 1.55% 121.762us 40.587us 0.000us 0.00% 5.454ms 1.818ms 3
|
| 4032 |
+
aten::_efficient_attention_forward 0.36% 27.980us 1.01% 79.030us 26.343us 5.454ms 89.29% 5.454ms 1.818ms 3
|
| 4033 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.454ms 89.29% 5.454ms 1.818ms 3
|
| 4034 |
+
aten::contiguous 0.10% 7.853us 23.65% 1.854ms 206.016us 0.000us 0.00% 711.999us 79.111us 9
|
| 4035 |
+
aten::clone 0.28% 21.760us 23.55% 1.846ms 205.144us 0.000us 0.00% 711.999us 79.111us 9
|
| 4036 |
+
aten::copy_ 0.86% 67.621us 22.63% 1.774ms 197.124us 654.399us 10.71% 711.999us 79.111us 9
|
| 4037 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.399us 10.71% 654.399us 72.711us 9
|
| 4038 |
+
Activity Buffer Request 18.63% 1.461ms 18.63% 1.461ms 1.461ms 57.600us 0.94% 57.600us 57.600us 1
|
| 4039 |
+
aten::transpose 0.60% 47.388us 0.81% 63.381us 2.641us 0.000us 0.00% 0.000us 0.000us 24
|
| 4040 |
+
aten::as_strided 0.20% 15.993us 0.20% 15.993us 0.666us 0.000us 0.00% 0.000us 0.000us 24
|
| 4041 |
+
aten::empty_like 0.15% 12.039us 0.64% 50.420us 5.602us 0.000us 0.00% 0.000us 0.000us 9
|
| 4042 |
+
aten::empty 0.81% 63.411us 0.81% 63.411us 3.020us 0.000us 0.00% 0.000us 0.000us 21
|
| 4043 |
+
cudaLaunchKernel 3.40% 266.437us 3.40% 266.437us 22.203us 0.000us 0.00% 0.000us 0.000us 12
|
| 4044 |
+
cudaStreamIsCapturing 0.03% 2.470us 0.03% 2.470us 0.823us 0.000us 0.00% 0.000us 0.000us 3
|
| 4045 |
+
cudaFuncSetAttribute 0.04% 3.000us 0.04% 3.000us 1.000us 0.000us 0.00% 0.000us 0.000us 3
|
| 4046 |
+
cudaDeviceSynchronize 70.91% 5.560ms 70.91% 5.560ms 5.560ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4047 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4048 |
+
Self CPU time total: 7.840ms
|
| 4049 |
+
Self CUDA time total: 6.108ms
|
| 4050 |
|
| 4051 |
|
| 4052 |
|
|
|
|
| 4056 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4057 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4058 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4059 |
+
torch_mem_eff 3.12% 251.727us 28.35% 2.287ms 2.287ms 0.000us 0.00% 6.402ms 6.402ms 1
|
| 4060 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.350ms 100.14% 6.350ms 6.350ms 1
|
| 4061 |
+
aten::scaled_dot_product_attention 0.24% 19.272us 1.78% 143.434us 47.811us 0.000us 0.00% 5.676ms 1.892ms 3
|
| 4062 |
+
aten::_scaled_dot_product_efficient_attention 0.24% 19.071us 1.54% 124.162us 41.387us 0.000us 0.00% 5.676ms 1.892ms 3
|
| 4063 |
+
aten::_efficient_attention_forward 0.36% 28.918us 1.02% 82.141us 27.380us 5.676ms 89.51% 5.676ms 1.892ms 3
|
| 4064 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.676ms 89.51% 5.676ms 1.892ms 3
|
| 4065 |
+
aten::contiguous 0.09% 7.578us 22.96% 1.852ms 205.774us 0.000us 0.00% 725.410us 80.601us 9
|
| 4066 |
+
aten::clone 0.27% 22.113us 22.87% 1.844ms 204.932us 0.000us 0.00% 725.410us 80.601us 9
|
| 4067 |
+
aten::copy_ 0.85% 68.201us 21.96% 1.771ms 196.780us 665.282us 10.49% 725.410us 80.601us 9
|
| 4068 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 665.282us 10.49% 665.282us 73.920us 9
|
| 4069 |
+
Activity Buffer Request 18.11% 1.461ms 18.11% 1.461ms 1.461ms 60.128us 0.95% 60.128us 60.128us 1
|
| 4070 |
+
aten::transpose 0.57% 46.288us 0.78% 62.529us 2.605us 0.000us 0.00% 0.000us 0.000us 24
|
| 4071 |
+
aten::as_strided 0.20% 16.241us 0.20% 16.241us 0.677us 0.000us 0.00% 0.000us 0.000us 24
|
| 4072 |
+
aten::empty_like 0.15% 12.469us 0.64% 51.250us 5.694us 0.000us 0.00% 0.000us 0.000us 9
|
| 4073 |
+
aten::empty 0.80% 64.494us 0.80% 64.494us 3.071us 0.000us 0.00% 0.000us 0.000us 21
|
| 4074 |
+
cudaLaunchKernel 3.27% 263.876us 3.27% 263.876us 21.990us 0.000us 0.00% 0.000us 0.000us 12
|
| 4075 |
+
cudaStreamIsCapturing 0.03% 2.440us 0.03% 2.440us 0.813us 0.000us 0.00% 0.000us 0.000us 3
|
| 4076 |
+
cudaFuncSetAttribute 0.04% 3.380us 0.04% 3.380us 1.127us 0.000us 0.00% 0.000us 0.000us 3
|
| 4077 |
+
cudaDeviceSynchronize 71.65% 5.779ms 71.65% 5.779ms 5.779ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4078 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4079 |
+
Self CPU time total: 8.066ms
|
| 4080 |
+
Self CUDA time total: 6.342ms
|
| 4081 |
|
| 4082 |
|
| 4083 |
|
|
|
|
| 4087 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4088 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4089 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4090 |
+
torch_mem_eff 2.86% 239.115us 26.99% 2.259ms 2.259ms 0.000us 0.00% 6.718ms 6.718ms 1
|
| 4091 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.665ms 100.13% 6.665ms 6.665ms 1
|
| 4092 |
+
aten::scaled_dot_product_attention 0.23% 19.210us 1.67% 139.873us 46.624us 0.000us 0.00% 5.983ms 1.994ms 3
|
| 4093 |
+
aten::_scaled_dot_product_efficient_attention 0.22% 18.712us 1.44% 120.663us 40.221us 0.000us 0.00% 5.983ms 1.994ms 3
|
| 4094 |
+
aten::_efficient_attention_forward 0.33% 27.381us 0.94% 78.541us 26.180us 5.983ms 89.89% 5.983ms 1.994ms 3
|
| 4095 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.983ms 89.89% 5.983ms 1.994ms 3
|
| 4096 |
+
aten::contiguous 0.09% 7.469us 21.99% 1.841ms 204.601us 0.000us 0.00% 734.336us 81.593us 9
|
| 4097 |
+
aten::clone 0.27% 22.450us 21.90% 1.834ms 203.772us 0.000us 0.00% 734.336us 81.593us 9
|
| 4098 |
+
aten::copy_ 0.80% 67.050us 21.01% 1.759ms 195.442us 673.088us 10.11% 734.336us 81.593us 9
|
| 4099 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 673.088us 10.11% 673.088us 74.788us 9
|
| 4100 |
+
Activity Buffer Request 17.30% 1.449ms 17.30% 1.449ms 1.449ms 61.248us 0.92% 61.248us 61.248us 1
|
| 4101 |
+
aten::transpose 0.55% 46.102us 0.74% 62.332us 2.597us 0.000us 0.00% 0.000us 0.000us 24
|
| 4102 |
+
aten::as_strided 0.19% 16.230us 0.19% 16.230us 0.676us 0.000us 0.00% 0.000us 0.000us 24
|
| 4103 |
+
aten::empty_like 0.14% 11.891us 0.63% 52.512us 5.835us 0.000us 0.00% 0.000us 0.000us 9
|
| 4104 |
+
aten::empty 0.78% 65.061us 0.78% 65.061us 3.098us 0.000us 0.00% 0.000us 0.000us 21
|
| 4105 |
+
cudaLaunchKernel 3.16% 264.678us 3.16% 264.678us 22.056us 0.000us 0.00% 0.000us 0.000us 12
|
| 4106 |
+
cudaStreamIsCapturing 0.03% 2.400us 0.03% 2.400us 0.800us 0.000us 0.00% 0.000us 0.000us 3
|
| 4107 |
+
cudaFuncSetAttribute 0.04% 2.990us 0.04% 2.990us 0.997us 0.000us 0.00% 0.000us 0.000us 3
|
| 4108 |
+
cudaDeviceSynchronize 73.01% 6.113ms 73.01% 6.113ms 6.113ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4109 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4110 |
+
Self CPU time total: 8.372ms
|
| 4111 |
+
Self CUDA time total: 6.656ms
|
| 4112 |
|
| 4113 |
|
| 4114 |
impl wl p50(ms) ok
|
| 4115 |
torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
|
| 4116 |
+
torch_mem_eff cuda_attn_L256_bfloat16 1.94 True
|
| 4117 |
+
torch_mem_eff cuda_attn_L320_bfloat16 1.95 True
|
| 4118 |
+
torch_mem_eff cuda_attn_L384_bfloat16 2.05 True
|
| 4119 |
+
torch_mem_eff cuda_attn_L448_bfloat16 2.07 True
|
| 4120 |
torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
|
| 4121 |
</pre></div>
|
| 4122 |
<div class="cell-artifacts">
|
flash_attn/impls/sage_attention.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
@@ -3869,9 +3877,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3869 |
<span class="collapse-indicators">
|
| 3870 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
-
<span id="uv-indicator-benchmark"
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: benchmark | 4.
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3920,28 +3928,23 @@ Cell: benchmark | 4.53s
|
|
| 3920 |
<div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
|
| 3921 |
impl wl p50(ms) ok
|
| 3922 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 3923 |
-
Error: module '
|
| 3924 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 3925 |
-
Error: module '
|
| 3926 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 3927 |
-
Error: module '
|
| 3928 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 3929 |
-
Error: module '
|
| 3930 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 3931 |
-
Error: module '
|
| 3932 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 3933 |
-
Error: module '
|
| 3934 |
</pre></div>
|
| 3935 |
-
<div class="
|
| 3936 |
-
|
| 3937 |
-
|
| 3938 |
-
|
| 3939 |
-
</div>
|
| 3940 |
</div>
|
| 3941 |
-
<div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
|
| 3942 |
-
Fetching 11 files: 18%|█▊ | 2/11 [00:00<00:00, 15.79it/s]
|
| 3943 |
-
Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 13.55it/s]
|
| 3944 |
-
Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 18.83it/s]</div>
|
| 3945 |
<div class="cell-artifacts">
|
| 3946 |
<h4>Artifacts:</h4>
|
| 3947 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
|
|
|
| 3877 |
<span class="collapse-indicators">
|
| 3878 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3879 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3880 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3881 |
</span> |
|
| 3882 |
+
Cell: benchmark | 4.19s
|
| 3883 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3884 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3885 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3928 |
<div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
|
| 3929 |
impl wl p50(ms) ok
|
| 3930 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 3931 |
+
Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
|
| 3932 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 3933 |
+
Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
|
| 3934 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 3935 |
+
Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
|
| 3936 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 3937 |
+
Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
|
| 3938 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 3939 |
+
Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
|
| 3940 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 3941 |
+
Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
|
| 3942 |
</pre></div>
|
| 3943 |
+
<div class="cell-stderr">
|
| 3944 |
+
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
|
| 3945 |
+
Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 13.96it/s]
|
| 3946 |
+
Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 19.18it/s]
|
|
|
|
| 3947 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3948 |
<div class="cell-artifacts">
|
| 3949 |
<h4>Artifacts:</h4>
|
| 3950 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
flash_attn/impls/xformers.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
@@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: benchmark |
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3923,21 +3931,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
|
|
| 3923 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3924 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3925 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3926 |
-
xformers_meff
|
| 3927 |
-
xformers_flash3::flash_fwd
|
| 3928 |
-
flash_attn_3::fwd 1.
|
| 3929 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3930 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3931 |
-
Activity Buffer Request
|
| 3932 |
-
aten::empty 0.
|
| 3933 |
-
cudaFuncSetAttribute 0.
|
| 3934 |
-
cudaLaunchKernel
|
| 3935 |
-
aten::reshape 0.
|
| 3936 |
-
aten::view 0.
|
| 3937 |
-
cudaDeviceSynchronize
|
| 3938 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3939 |
-
Self CPU time total: 4.
|
| 3940 |
-
Self CUDA time total: 2.
|
| 3941 |
|
| 3942 |
|
| 3943 |
|
|
@@ -3947,21 +3955,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
|
|
| 3947 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3948 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3949 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3950 |
-
xformers_meff
|
| 3951 |
-
xformers_flash3::flash_fwd 3.
|
| 3952 |
-
flash_attn_3::fwd 1.
|
| 3953 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3954 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3955 |
-
Activity Buffer Request 32.
|
| 3956 |
-
aten::empty 0.
|
| 3957 |
-
cudaFuncSetAttribute 0.12% 5.
|
| 3958 |
-
cudaLaunchKernel 0.
|
| 3959 |
-
aten::reshape 0.
|
| 3960 |
-
aten::view 0.
|
| 3961 |
-
cudaDeviceSynchronize 54.
|
| 3962 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3963 |
-
Self CPU time total: 4.
|
| 3964 |
-
Self CUDA time total: 2.
|
| 3965 |
|
| 3966 |
|
| 3967 |
|
|
@@ -3971,21 +3979,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
|
|
| 3971 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3972 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3973 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3974 |
-
xformers_meff 6.
|
| 3975 |
-
xformers_flash3::flash_fwd 3.
|
| 3976 |
-
flash_attn_3::fwd 1.15% 52.
|
| 3977 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3978 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3979 |
-
Activity Buffer Request 31.
|
| 3980 |
-
aten::empty 0.
|
| 3981 |
-
cudaFuncSetAttribute 0.12% 5.
|
| 3982 |
-
cudaLaunchKernel 0.
|
| 3983 |
-
aten::reshape 0.
|
| 3984 |
-
aten::view 0.
|
| 3985 |
-
cudaDeviceSynchronize 55.
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
-
Self CPU time total: 4.
|
| 3988 |
-
Self CUDA time total: 2.
|
| 3989 |
|
| 3990 |
|
| 3991 |
|
|
@@ -3995,21 +4003,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
|
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3997 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3998 |
-
xformers_meff 6.
|
| 3999 |
-
xformers_flash3::flash_fwd 3.
|
| 4000 |
-
flash_attn_3::fwd 1.
|
| 4001 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4002 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4003 |
-
Activity Buffer Request 30.
|
| 4004 |
-
aten::empty 0.
|
| 4005 |
-
cudaFuncSetAttribute 0.
|
| 4006 |
-
cudaLaunchKernel
|
| 4007 |
-
aten::reshape 0.
|
| 4008 |
-
aten::view 0.29% 13.
|
| 4009 |
-
cudaDeviceSynchronize 52.
|
| 4010 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4011 |
-
Self CPU time total: 4.
|
| 4012 |
-
Self CUDA time total: 2.
|
| 4013 |
|
| 4014 |
|
| 4015 |
|
|
@@ -4019,21 +4027,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
|
|
| 4019 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4020 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
-
xformers_meff 5.
|
| 4023 |
-
xformers_flash3::flash_fwd 2.
|
| 4024 |
-
flash_attn_3::fwd 1.
|
| 4025 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4026 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4027 |
-
Activity Buffer Request 27.
|
| 4028 |
-
aten::empty 0.
|
| 4029 |
-
cudaFuncSetAttribute 0.
|
| 4030 |
-
cudaLaunchKernel
|
| 4031 |
-
aten::reshape 0.18% 9.
|
| 4032 |
-
aten::view 0.
|
| 4033 |
-
cudaDeviceSynchronize
|
| 4034 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4035 |
-
Self CPU time total: 5.
|
| 4036 |
-
Self CUDA time total: 3.
|
| 4037 |
|
| 4038 |
|
| 4039 |
|
|
@@ -4043,37 +4051,83 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
|
|
| 4043 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4044 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4045 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4046 |
-
xformers_meff 5.
|
| 4047 |
-
xformers_flash3::flash_fwd 2.
|
| 4048 |
-
flash_attn_3::fwd 1.
|
| 4049 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4050 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4051 |
-
Activity Buffer Request
|
| 4052 |
-
aten::empty 0.
|
| 4053 |
-
cudaFuncSetAttribute 0.11% 5.
|
| 4054 |
-
cudaLaunchKernel
|
| 4055 |
-
aten::reshape 0.17% 8.
|
| 4056 |
-
aten::view 0.
|
| 4057 |
-
cudaDeviceSynchronize
|
| 4058 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4059 |
-
Self CPU time total: 5.
|
| 4060 |
-
Self CUDA time total: 3.
|
| 4061 |
|
| 4062 |
|
| 4063 |
impl wl p50(ms) ok
|
| 4064 |
-
xformers_meff cuda_attn_L128_bfloat16
|
| 4065 |
-
xformers_meff cuda_attn_L256_bfloat16 1.
|
| 4066 |
-
xformers_meff cuda_attn_L320_bfloat16 1.
|
| 4067 |
-
xformers_meff cuda_attn_L384_bfloat16 1.
|
| 4068 |
-
xformers_meff cuda_attn_L448_bfloat16 1.
|
| 4069 |
-
xformers_meff cuda_attn_L512_bfloat16 1.
|
| 4070 |
</pre></div>
|
| 4071 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4072 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4073 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4074 |
Downloading xformers (111.8MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4075 |
Downloading xformers
|
| 4076 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4077 |
</div>
|
| 4078 |
</div>
|
| 4079 |
<div class="cell-artifacts">
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
|
|
|
| 3879 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3880 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3881 |
</span> |
|
| 3882 |
+
Cell: benchmark | 33.44s
|
| 3883 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3884 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3885 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3931 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3932 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3933 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3934 |
+
xformers_meff 9.89% 457.200us 48.78% 2.255ms 2.255ms 0.000us 0.00% 3.820ms 3.820ms 1
|
| 3935 |
+
xformers_flash3::flash_fwd 3.84% 177.424us 38.10% 1.761ms 587.077us 0.000us 0.00% 3.820ms 1.273ms 3
|
| 3936 |
+
flash_attn_3::fwd 1.55% 71.862us 34.26% 1.584ms 527.935us 2.885ms 100.00% 3.820ms 1.273ms 3
|
| 3937 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.886ms 100.04% 2.886ms 2.886ms 1
|
| 3938 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.885ms 100.00% 2.885ms 961.658us 3
|
| 3939 |
+
Activity Buffer Request 30.73% 1.420ms 30.73% 1.420ms 1.420ms 934.553us 32.39% 934.553us 934.553us 1
|
| 3940 |
+
aten::empty 0.74% 34.201us 0.74% 34.201us 5.700us 0.000us 0.00% 0.000us 0.000us 6
|
| 3941 |
+
cudaFuncSetAttribute 0.22% 10.110us 0.22% 10.110us 3.370us 0.000us 0.00% 0.000us 0.000us 3
|
| 3942 |
+
cudaLaunchKernel 1.02% 47.230us 1.02% 47.230us 15.743us 0.000us 0.00% 0.000us 0.000us 3
|
| 3943 |
+
aten::reshape 0.34% 15.510us 0.79% 36.581us 6.097us 0.000us 0.00% 0.000us 0.000us 6
|
| 3944 |
+
aten::view 0.46% 21.071us 0.46% 21.071us 3.512us 0.000us 0.00% 0.000us 0.000us 6
|
| 3945 |
+
cudaDeviceSynchronize 51.22% 2.368ms 51.22% 2.368ms 2.368ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3946 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3947 |
+
Self CPU time total: 4.623ms
|
| 3948 |
+
Self CUDA time total: 2.885ms
|
| 3949 |
|
| 3950 |
|
| 3951 |
|
|
|
|
| 3955 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3956 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3957 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3958 |
+
xformers_meff 6.56% 301.335us 45.12% 2.073ms 2.073ms 0.000us 0.00% 3.862ms 3.862ms 1
|
| 3959 |
+
xformers_flash3::flash_fwd 3.02% 138.865us 38.04% 1.748ms 582.607us 0.000us 0.00% 3.862ms 1.287ms 3
|
| 3960 |
+
flash_attn_3::fwd 1.15% 53.013us 35.02% 1.609ms 536.319us 2.932ms 100.00% 3.862ms 1.287ms 3
|
| 3961 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.933ms 100.04% 2.933ms 2.933ms 1
|
| 3962 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.932ms 100.00% 2.932ms 977.308us 3
|
| 3963 |
+
Activity Buffer Request 32.36% 1.487ms 32.36% 1.487ms 1.487ms 930.332us 31.73% 930.332us 930.332us 1
|
| 3964 |
+
aten::empty 0.65% 29.679us 0.65% 29.679us 4.946us 0.000us 0.00% 0.000us 0.000us 6
|
| 3965 |
+
cudaFuncSetAttribute 0.12% 5.591us 0.12% 5.591us 1.864us 0.000us 0.00% 0.000us 0.000us 3
|
| 3966 |
+
cudaLaunchKernel 0.74% 34.170us 0.74% 34.170us 11.390us 0.000us 0.00% 0.000us 0.000us 3
|
| 3967 |
+
aten::reshape 0.22% 9.881us 0.51% 23.631us 3.938us 0.000us 0.00% 0.000us 0.000us 6
|
| 3968 |
+
aten::view 0.30% 13.750us 0.30% 13.750us 2.292us 0.000us 0.00% 0.000us 0.000us 6
|
| 3969 |
+
cudaDeviceSynchronize 54.88% 2.521ms 54.88% 2.521ms 2.521ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3970 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3971 |
+
Self CPU time total: 4.594ms
|
| 3972 |
+
Self CUDA time total: 2.932ms
|
| 3973 |
|
| 3974 |
|
| 3975 |
|
|
|
|
| 3979 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3980 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3981 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3982 |
+
xformers_meff 6.47% 295.057us 44.36% 2.024ms 2.024ms 0.000us 0.00% 3.906ms 3.906ms 1
|
| 3983 |
+
xformers_flash3::flash_fwd 3.08% 140.693us 37.39% 1.706ms 568.676us 0.000us 0.00% 3.906ms 1.302ms 3
|
| 3984 |
+
flash_attn_3::fwd 1.15% 52.641us 34.31% 1.565ms 521.779us 2.948ms 100.00% 3.906ms 1.302ms 3
|
| 3985 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.949ms 100.05% 2.949ms 2.949ms 1
|
| 3986 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.948ms 100.00% 2.948ms 982.658us 3
|
| 3987 |
+
Activity Buffer Request 31.65% 1.444ms 31.65% 1.444ms 1.444ms 958.263us 32.51% 958.263us 958.263us 1
|
| 3988 |
+
aten::empty 0.65% 29.440us 0.65% 29.440us 4.907us 0.000us 0.00% 0.000us 0.000us 6
|
| 3989 |
+
cudaFuncSetAttribute 0.12% 5.511us 0.12% 5.511us 1.837us 0.000us 0.00% 0.000us 0.000us 3
|
| 3990 |
+
cudaLaunchKernel 0.74% 33.911us 0.74% 33.911us 11.304us 0.000us 0.00% 0.000us 0.000us 3
|
| 3991 |
+
aten::reshape 0.18% 8.109us 0.50% 22.850us 3.808us 0.000us 0.00% 0.000us 0.000us 6
|
| 3992 |
+
aten::view 0.32% 14.741us 0.32% 14.741us 2.457us 0.000us 0.00% 0.000us 0.000us 6
|
| 3993 |
+
cudaDeviceSynchronize 55.64% 2.539ms 55.64% 2.539ms 2.539ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3994 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3995 |
+
Self CPU time total: 4.562ms
|
| 3996 |
+
Self CUDA time total: 2.948ms
|
| 3997 |
|
| 3998 |
|
| 3999 |
|
|
|
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4005 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4006 |
+
xformers_meff 6.44% 300.857us 47.49% 2.217ms 2.217ms 0.000us 0.00% 3.827ms 3.827ms 1
|
| 4007 |
+
xformers_flash3::flash_fwd 3.16% 147.703us 40.53% 1.892ms 630.694us 0.000us 0.00% 3.827ms 1.276ms 3
|
| 4008 |
+
flash_attn_3::fwd 1.13% 52.820us 37.36% 1.744ms 581.460us 2.874ms 100.00% 3.827ms 1.276ms 3
|
| 4009 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.876ms 100.05% 2.876ms 2.876ms 1
|
| 4010 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.874ms 100.00% 2.874ms 958.161us 3
|
| 4011 |
+
Activity Buffer Request 30.85% 1.440ms 30.85% 1.440ms 1.440ms 952.124us 33.12% 952.124us 952.124us 1
|
| 4012 |
+
aten::empty 0.63% 29.391us 0.63% 29.391us 4.899us 0.000us 0.00% 0.000us 0.000us 6
|
| 4013 |
+
cudaFuncSetAttribute 0.13% 5.930us 0.13% 5.930us 1.977us 0.000us 0.00% 0.000us 0.000us 3
|
| 4014 |
+
cudaLaunchKernel 4.63% 215.955us 4.63% 215.955us 71.985us 0.000us 0.00% 0.000us 0.000us 3
|
| 4015 |
+
aten::reshape 0.22% 10.380us 0.51% 23.940us 3.990us 0.000us 0.00% 0.000us 0.000us 6
|
| 4016 |
+
aten::view 0.29% 13.560us 0.29% 13.560us 2.260us 0.000us 0.00% 0.000us 0.000us 6
|
| 4017 |
+
cudaDeviceSynchronize 52.51% 2.452ms 52.51% 2.452ms 2.452ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4018 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4019 |
+
Self CPU time total: 4.669ms
|
| 4020 |
+
Self CUDA time total: 2.874ms
|
| 4021 |
|
| 4022 |
|
| 4023 |
|
|
|
|
| 4027 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4028 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4029 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4030 |
+
xformers_meff 5.75% 298.955us 42.23% 2.194ms 2.194ms 0.000us 0.00% 4.560ms 4.560ms 1
|
| 4031 |
+
xformers_flash3::flash_fwd 2.73% 142.094us 36.04% 1.872ms 624.074us 0.000us 0.00% 4.560ms 1.520ms 3
|
| 4032 |
+
flash_attn_3::fwd 1.06% 54.881us 33.30% 1.730ms 576.710us 3.413ms 100.00% 4.560ms 1.520ms 3
|
| 4033 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.415ms 100.04% 3.415ms 3.415ms 1
|
| 4034 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.413ms 100.00% 3.413ms 1.138ms 3
|
| 4035 |
+
Activity Buffer Request 27.56% 1.432ms 27.56% 1.432ms 1.432ms 1.147ms 33.59% 1.147ms 1.147ms 1
|
| 4036 |
+
aten::empty 0.56% 28.860us 0.56% 28.860us 4.810us 0.000us 0.00% 0.000us 0.000us 6
|
| 4037 |
+
cudaFuncSetAttribute 0.10% 5.420us 0.10% 5.420us 1.807us 0.000us 0.00% 0.000us 0.000us 3
|
| 4038 |
+
cudaLaunchKernel 4.02% 208.865us 4.02% 208.865us 69.622us 0.000us 0.00% 0.000us 0.000us 3
|
| 4039 |
+
aten::reshape 0.18% 9.222us 0.44% 22.901us 3.817us 0.000us 0.00% 0.000us 0.000us 6
|
| 4040 |
+
aten::view 0.26% 13.679us 0.26% 13.679us 2.280us 0.000us 0.00% 0.000us 0.000us 6
|
| 4041 |
+
cudaDeviceSynchronize 57.77% 3.001ms 57.77% 3.001ms 3.001ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4042 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4043 |
+
Self CPU time total: 5.196ms
|
| 4044 |
+
Self CUDA time total: 3.413ms
|
| 4045 |
|
| 4046 |
|
| 4047 |
|
|
|
|
| 4051 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4052 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4053 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4054 |
+
xformers_meff 5.27% 272.556us 42.19% 2.184ms 2.184ms 0.000us 0.00% 4.536ms 4.536ms 1
|
| 4055 |
+
xformers_flash3::flash_fwd 2.70% 139.942us 36.49% 1.889ms 629.618us 0.000us 0.00% 4.536ms 1.512ms 3
|
| 4056 |
+
flash_attn_3::fwd 1.02% 52.981us 33.79% 1.749ms 582.970us 3.398ms 100.00% 4.536ms 1.512ms 3
|
| 4057 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.400ms 100.05% 3.400ms 3.400ms 1
|
| 4058 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.398ms 100.00% 3.398ms 1.133ms 3
|
| 4059 |
+
Activity Buffer Request 28.10% 1.454ms 28.10% 1.454ms 1.454ms 1.138ms 33.49% 1.138ms 1.138ms 1
|
| 4060 |
+
aten::empty 0.56% 28.991us 0.56% 28.991us 4.832us 0.000us 0.00% 0.000us 0.000us 6
|
| 4061 |
+
cudaFuncSetAttribute 0.11% 5.511us 0.11% 5.511us 1.837us 0.000us 0.00% 0.000us 0.000us 3
|
| 4062 |
+
cudaLaunchKernel 4.00% 207.225us 4.00% 207.225us 69.075us 0.000us 0.00% 0.000us 0.000us 3
|
| 4063 |
+
aten::reshape 0.17% 8.891us 0.44% 22.532us 3.755us 0.000us 0.00% 0.000us 0.000us 6
|
| 4064 |
+
aten::view 0.26% 13.641us 0.26% 13.641us 2.274us 0.000us 0.00% 0.000us 0.000us 6
|
| 4065 |
+
cudaDeviceSynchronize 57.81% 2.992ms 57.81% 2.992ms 2.992ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4066 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4067 |
+
Self CPU time total: 5.176ms
|
| 4068 |
+
Self CUDA time total: 3.398ms
|
| 4069 |
|
| 4070 |
|
| 4071 |
impl wl p50(ms) ok
|
| 4072 |
+
xformers_meff cuda_attn_L128_bfloat16 0.99 True
|
| 4073 |
+
xformers_meff cuda_attn_L256_bfloat16 1.05 True
|
| 4074 |
+
xformers_meff cuda_attn_L320_bfloat16 1.06 True
|
| 4075 |
+
xformers_meff cuda_attn_L384_bfloat16 1.06 True
|
| 4076 |
+
xformers_meff cuda_attn_L448_bfloat16 1.23 True
|
| 4077 |
+
xformers_meff cuda_attn_L512_bfloat16 1.23 True
|
| 4078 |
</pre></div>
|
| 4079 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4080 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4081 |
<div class="uv-logs-content" style="display: none;">
|
| 4082 |
+
Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
|
| 4083 |
+
Downloading setuptools (1.1MiB)
|
| 4084 |
+
Downloading numpy (16.2MiB)
|
| 4085 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4086 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4087 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4088 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4089 |
+
Downloading kiwisolver (1.4MiB)
|
| 4090 |
+
Downloading torch (846.9MiB)
|
| 4091 |
+
Downloading matplotlib (8.3MiB)
|
| 4092 |
+
Downloading sympy (6.0MiB)
|
| 4093 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4094 |
+
Downloading fonttools (4.7MiB)
|
| 4095 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4096 |
+
Downloading pillow (6.7MiB)
|
| 4097 |
+
Downloading networkx (1.9MiB)
|
| 4098 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4099 |
+
Downloading triton (148.3MiB)
|
| 4100 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4101 |
Downloading xformers (111.8MiB)
|
| 4102 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4103 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4104 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4105 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4106 |
+
Downloading nvidia-cufile-cu12
|
| 4107 |
+
Downloading kiwisolver
|
| 4108 |
+
Downloading setuptools
|
| 4109 |
+
Downloading networkx
|
| 4110 |
+
Downloading fonttools
|
| 4111 |
+
Downloading pillow
|
| 4112 |
+
Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
|
| 4113 |
+
Downloading nvidia-cuda-cupti-cu12
|
| 4114 |
+
Downloading matplotlib
|
| 4115 |
+
Downloading numpy
|
| 4116 |
+
Downloading sympy
|
| 4117 |
+
Downloading nvidia-nvjitlink-cu12
|
| 4118 |
+
Downloading nvidia-curand-cu12
|
| 4119 |
+
Downloading nvidia-cuda-nvrtc-cu12
|
| 4120 |
Downloading xformers
|
| 4121 |
+
Downloading triton
|
| 4122 |
+
Downloading nvidia-cufft-cu12
|
| 4123 |
+
Downloading nvidia-cusolver-cu12
|
| 4124 |
+
Downloading nvidia-cusparse-cu12
|
| 4125 |
+
Downloading nvidia-cusparselt-cu12
|
| 4126 |
+
Downloading nvidia-nccl-cu12
|
| 4127 |
+
Downloading nvidia-cudnn-cu12
|
| 4128 |
+
Downloading nvidia-cublas-cu12
|
| 4129 |
+
Downloading torch
|
| 4130 |
+
Installed 38 packages in 204ms
|
| 4131 |
</div>
|
| 4132 |
</div>
|
| 4133 |
<div class="cell-artifacts">
|
flash_attn/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
flash_attn/results/combined_results.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
@@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3872 |
<rdf:RDF>
|
| 3873 |
<ns2:Work>
|
| 3874 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3875 |
-
<dc:date>2025-10-
|
| 3876 |
<dc:format>image/svg+xml</dc:format>
|
| 3877 |
<dc:creator>
|
| 3878 |
<ns2:Agent>
|
|
@@ -3982,96 +3990,96 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3982 |
<g id="matplotlib.axis_2">
|
| 3983 |
<g id="ytick_1">
|
| 3984 |
<g id="grid-y--2" class="grid grid-y">
|
| 3985 |
-
<path d="M 47.81
|
| 3986 |
</g>
|
| 3987 |
<g id="line2d_7">
|
| 3988 |
<defs>
|
| 3989 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3990 |
</defs>
|
| 3991 |
<g>
|
| 3992 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 3993 |
</g>
|
| 3994 |
</g>
|
| 3995 |
<g id="text_7">
|
| 3996 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 3997 |
</g>
|
| 3998 |
</g>
|
| 3999 |
<g id="ytick_2">
|
| 4000 |
<g id="grid-y--3" class="grid grid-y">
|
| 4001 |
-
<path d="M 47.81
|
| 4002 |
</g>
|
| 4003 |
<g id="line2d_8">
|
| 4004 |
<g>
|
| 4005 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4006 |
</g>
|
| 4007 |
</g>
|
| 4008 |
<g id="text_8">
|
| 4009 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="ytick_3">
|
| 4013 |
<g id="grid-y--4" class="grid grid-y">
|
| 4014 |
-
<path d="M 47.81
|
| 4015 |
</g>
|
| 4016 |
<g id="line2d_9">
|
| 4017 |
<g>
|
| 4018 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4019 |
</g>
|
| 4020 |
</g>
|
| 4021 |
<g id="text_9">
|
| 4022 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="ytick_4">
|
| 4026 |
<g id="grid-y--5" class="grid grid-y">
|
| 4027 |
-
<path d="M 47.81
|
| 4028 |
</g>
|
| 4029 |
<g id="line2d_10">
|
| 4030 |
<g>
|
| 4031 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4032 |
</g>
|
| 4033 |
</g>
|
| 4034 |
<g id="text_10">
|
| 4035 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="ytick_5">
|
| 4039 |
<g id="grid-y--6" class="grid grid-y">
|
| 4040 |
-
<path d="M 47.81
|
| 4041 |
</g>
|
| 4042 |
<g id="line2d_11">
|
| 4043 |
<g>
|
| 4044 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4045 |
</g>
|
| 4046 |
</g>
|
| 4047 |
<g id="text_11">
|
| 4048 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="ytick_6">
|
| 4052 |
<g id="grid-y--7" class="grid grid-y">
|
| 4053 |
-
<path d="M 47.81
|
| 4054 |
</g>
|
| 4055 |
<g id="line2d_12">
|
| 4056 |
<g>
|
| 4057 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4058 |
</g>
|
| 4059 |
</g>
|
| 4060 |
<g id="text_12">
|
| 4061 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4062 |
</g>
|
| 4063 |
</g>
|
| 4064 |
<g id="ytick_7">
|
| 4065 |
<g id="grid-y--8" class="grid grid-y">
|
| 4066 |
-
<path d="M 47.81 42.
|
| 4067 |
</g>
|
| 4068 |
<g id="line2d_13">
|
| 4069 |
<g>
|
| 4070 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="42.
|
| 4071 |
</g>
|
| 4072 |
</g>
|
| 4073 |
<g id="text_13">
|
| 4074 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="46.
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="label--y" class="ylabel">
|
|
@@ -4079,73 +4087,73 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="series--torch-flash-ma" class="series">
|
| 4082 |
-
<path d="M 83.607806
|
| 4083 |
<defs>
|
| 4084 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4085 |
</defs>
|
| 4086 |
<g clip-path="url(#p09feef2583)">
|
| 4087 |
-
<use ns4:href="#md7efaf3aec" x="83.607806" y="
|
| 4088 |
-
<use ns4:href="#md7efaf3aec" x="226.799032" y="
|
| 4089 |
-
<use ns4:href="#md7efaf3aec" x="369.990258" y="
|
| 4090 |
-
<use ns4:href="#md7efaf3aec" x="513.181484" y="
|
| 4091 |
-
<use ns4:href="#md7efaf3aec" x="656.37271" y="
|
| 4092 |
-
<use ns4:href="#md7efaf3aec" x="799.563935" y="
|
| 4093 |
</g>
|
| 4094 |
</g>
|
| 4095 |
<g id="series--torch-mem-eff" class="series">
|
| 4096 |
-
<path d="M 83.607806
|
| 4097 |
<defs>
|
| 4098 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4099 |
</defs>
|
| 4100 |
<g clip-path="url(#p09feef2583)">
|
| 4101 |
-
<use ns4:href="#m9b8c54d372" x="83.607806" y="
|
| 4102 |
-
<use ns4:href="#m9b8c54d372" x="226.799032" y="
|
| 4103 |
-
<use ns4:href="#m9b8c54d372" x="369.990258" y="
|
| 4104 |
-
<use ns4:href="#m9b8c54d372" x="513.181484" y="
|
| 4105 |
-
<use ns4:href="#m9b8c54d372" x="656.37271" y="
|
| 4106 |
<use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4107 |
</g>
|
| 4108 |
</g>
|
| 4109 |
<g id="series--xformers-meff" class="series">
|
| 4110 |
-
<path d="M 83.607806
|
| 4111 |
<defs>
|
| 4112 |
<path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
|
| 4113 |
</defs>
|
| 4114 |
<g clip-path="url(#p09feef2583)">
|
| 4115 |
-
<use ns4:href="#mc655281e0b" x="83.607806" y="
|
| 4116 |
-
<use ns4:href="#mc655281e0b" x="226.799032" y="
|
| 4117 |
-
<use ns4:href="#mc655281e0b" x="369.990258" y="
|
| 4118 |
-
<use ns4:href="#mc655281e0b" x="513.181484" y="
|
| 4119 |
-
<use ns4:href="#mc655281e0b" x="656.37271" y="
|
| 4120 |
-
<use ns4:href="#mc655281e0b" x="799.563935" y="
|
| 4121 |
</g>
|
| 4122 |
</g>
|
| 4123 |
<g id="series--hf-kernels-flash-attn" class="series">
|
| 4124 |
-
<path d="M 83.607806
|
| 4125 |
<defs>
|
| 4126 |
<path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
|
| 4127 |
</defs>
|
| 4128 |
<g clip-path="url(#p09feef2583)">
|
| 4129 |
-
<use ns4:href="#m61c8040d7e" x="83.607806" y="
|
| 4130 |
-
<use ns4:href="#m61c8040d7e" x="226.799032" y="
|
| 4131 |
-
<use ns4:href="#m61c8040d7e" x="369.990258" y="
|
| 4132 |
-
<use ns4:href="#m61c8040d7e" x="513.181484" y="
|
| 4133 |
-
<use ns4:href="#m61c8040d7e" x="656.37271" y="
|
| 4134 |
-
<use ns4:href="#m61c8040d7e" x="799.563935" y="
|
| 4135 |
</g>
|
| 4136 |
</g>
|
| 4137 |
<g id="series--hf-kernels-flash-attn3" class="series">
|
| 4138 |
-
<path d="M 83.607806
|
| 4139 |
<defs>
|
| 4140 |
<path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
|
| 4141 |
</defs>
|
| 4142 |
<g clip-path="url(#p09feef2583)">
|
| 4143 |
-
<use ns4:href="#m7cd35be9cc" x="83.607806" y="
|
| 4144 |
-
<use ns4:href="#m7cd35be9cc" x="226.799032" y="
|
| 4145 |
-
<use ns4:href="#m7cd35be9cc" x="369.990258" y="
|
| 4146 |
-
<use ns4:href="#m7cd35be9cc" x="513.181484" y="
|
| 4147 |
-
<use ns4:href="#m7cd35be9cc" x="656.37271" y="
|
| 4148 |
-
<use ns4:href="#m7cd35be9cc" x="799.563935" y="
|
| 4149 |
</g>
|
| 4150 |
</g>
|
| 4151 |
<g id="patch_3">
|
|
@@ -4230,7 +4238,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4230 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4231 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4232 |
</span> |
|
| 4233 |
-
Cell: combine | 4.
|
| 4234 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4235 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4236 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4337,48 +4345,48 @@ Summary: 6 found, 0 skipped, 0 missing
|
|
| 4337 |
COMBINED BENCHMARK SUMMARY
|
| 4338 |
|
| 4339 |
impl wl p50(ms) ok
|
| 4340 |
-
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.
|
| 4341 |
-
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.
|
| 4342 |
-
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.
|
| 4343 |
-
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.
|
| 4344 |
-
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.
|
| 4345 |
-
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.
|
| 4346 |
-
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16
|
| 4347 |
-
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.
|
| 4348 |
-
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.
|
| 4349 |
-
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.
|
| 4350 |
-
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.
|
| 4351 |
-
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.
|
| 4352 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 4353 |
-
Error: module '
|
| 4354 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 4355 |
-
Error: module '
|
| 4356 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 4357 |
-
Error: module '
|
| 4358 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 4359 |
-
Error: module '
|
| 4360 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 4361 |
-
Error: module '
|
| 4362 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 4363 |
-
Error: module '
|
| 4364 |
-
torch_flash_ma cuda_attn_L128_bfloat16 1.
|
| 4365 |
-
torch_flash_ma cuda_attn_L256_bfloat16 1.
|
| 4366 |
-
torch_flash_ma cuda_attn_L320_bfloat16 1.
|
| 4367 |
-
torch_flash_ma cuda_attn_L384_bfloat16 1.
|
| 4368 |
-
torch_flash_ma cuda_attn_L448_bfloat16 1.
|
| 4369 |
-
torch_flash_ma cuda_attn_L512_bfloat16 1.
|
| 4370 |
torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
|
| 4371 |
-
torch_mem_eff cuda_attn_L256_bfloat16 1.
|
| 4372 |
-
torch_mem_eff cuda_attn_L320_bfloat16
|
| 4373 |
-
torch_mem_eff cuda_attn_L384_bfloat16
|
| 4374 |
-
torch_mem_eff cuda_attn_L448_bfloat16 2.
|
| 4375 |
torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
|
| 4376 |
-
xformers_meff cuda_attn_L128_bfloat16
|
| 4377 |
-
xformers_meff cuda_attn_L256_bfloat16 1.
|
| 4378 |
-
xformers_meff cuda_attn_L320_bfloat16 1.
|
| 4379 |
-
xformers_meff cuda_attn_L384_bfloat16 1.
|
| 4380 |
-
xformers_meff cuda_attn_L448_bfloat16 1.
|
| 4381 |
-
xformers_meff cuda_attn_L512_bfloat16 1.
|
| 4382 |
|
| 4383 |
GENERATING COMBINED VISUALIZATION
|
| 4384 |
|
|
@@ -4402,7 +4410,7 @@ Implementations included:
|
|
| 4402 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4403 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4404 |
<div class="uv-logs-content" style="display: none;">
|
| 4405 |
-
Installed 37 packages in
|
| 4406 |
</div>
|
| 4407 |
</div>
|
| 4408 |
<div class="cell-artifacts">
|
|
@@ -4415,7 +4423,7 @@ Installed 37 packages in 208ms
|
|
| 4415 |
<rdf:RDF>
|
| 4416 |
<ns2:Work>
|
| 4417 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4418 |
-
<dc:date>2025-10-
|
| 4419 |
<dc:format>image/svg+xml</dc:format>
|
| 4420 |
<dc:creator>
|
| 4421 |
<ns2:Agent>
|
|
@@ -4525,96 +4533,96 @@ Installed 37 packages in 208ms
|
|
| 4525 |
<g id="matplotlib.axis_2">
|
| 4526 |
<g id="ytick_1">
|
| 4527 |
<g id="grid-y--2" class="grid grid-y">
|
| 4528 |
-
<path d="M 47.81
|
| 4529 |
</g>
|
| 4530 |
<g id="line2d_7">
|
| 4531 |
<defs>
|
| 4532 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4533 |
</defs>
|
| 4534 |
<g>
|
| 4535 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4536 |
</g>
|
| 4537 |
</g>
|
| 4538 |
<g id="text_7">
|
| 4539 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4540 |
</g>
|
| 4541 |
</g>
|
| 4542 |
<g id="ytick_2">
|
| 4543 |
<g id="grid-y--3" class="grid grid-y">
|
| 4544 |
-
<path d="M 47.81
|
| 4545 |
</g>
|
| 4546 |
<g id="line2d_8">
|
| 4547 |
<g>
|
| 4548 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4549 |
</g>
|
| 4550 |
</g>
|
| 4551 |
<g id="text_8">
|
| 4552 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4553 |
</g>
|
| 4554 |
</g>
|
| 4555 |
<g id="ytick_3">
|
| 4556 |
<g id="grid-y--4" class="grid grid-y">
|
| 4557 |
-
<path d="M 47.81
|
| 4558 |
</g>
|
| 4559 |
<g id="line2d_9">
|
| 4560 |
<g>
|
| 4561 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4562 |
</g>
|
| 4563 |
</g>
|
| 4564 |
<g id="text_9">
|
| 4565 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4566 |
</g>
|
| 4567 |
</g>
|
| 4568 |
<g id="ytick_4">
|
| 4569 |
<g id="grid-y--5" class="grid grid-y">
|
| 4570 |
-
<path d="M 47.81
|
| 4571 |
</g>
|
| 4572 |
<g id="line2d_10">
|
| 4573 |
<g>
|
| 4574 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4575 |
</g>
|
| 4576 |
</g>
|
| 4577 |
<g id="text_10">
|
| 4578 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4579 |
</g>
|
| 4580 |
</g>
|
| 4581 |
<g id="ytick_5">
|
| 4582 |
<g id="grid-y--6" class="grid grid-y">
|
| 4583 |
-
<path d="M 47.81
|
| 4584 |
</g>
|
| 4585 |
<g id="line2d_11">
|
| 4586 |
<g>
|
| 4587 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4588 |
</g>
|
| 4589 |
</g>
|
| 4590 |
<g id="text_11">
|
| 4591 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4592 |
</g>
|
| 4593 |
</g>
|
| 4594 |
<g id="ytick_6">
|
| 4595 |
<g id="grid-y--7" class="grid grid-y">
|
| 4596 |
-
<path d="M 47.81
|
| 4597 |
</g>
|
| 4598 |
<g id="line2d_12">
|
| 4599 |
<g>
|
| 4600 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4601 |
</g>
|
| 4602 |
</g>
|
| 4603 |
<g id="text_12">
|
| 4604 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4605 |
</g>
|
| 4606 |
</g>
|
| 4607 |
<g id="ytick_7">
|
| 4608 |
<g id="grid-y--8" class="grid grid-y">
|
| 4609 |
-
<path d="M 47.81 42.
|
| 4610 |
</g>
|
| 4611 |
<g id="line2d_13">
|
| 4612 |
<g>
|
| 4613 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="42.
|
| 4614 |
</g>
|
| 4615 |
</g>
|
| 4616 |
<g id="text_13">
|
| 4617 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="46.
|
| 4618 |
</g>
|
| 4619 |
</g>
|
| 4620 |
<g id="label--y" class="ylabel">
|
|
@@ -4622,73 +4630,73 @@ Installed 37 packages in 208ms
|
|
| 4622 |
</g>
|
| 4623 |
</g>
|
| 4624 |
<g id="series--torch-flash-ma" class="series">
|
| 4625 |
-
<path d="M 83.607806
|
| 4626 |
<defs>
|
| 4627 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4628 |
</defs>
|
| 4629 |
<g clip-path="url(#p09feef2583)">
|
| 4630 |
-
<use ns4:href="#md7efaf3aec" x="83.607806" y="
|
| 4631 |
-
<use ns4:href="#md7efaf3aec" x="226.799032" y="
|
| 4632 |
-
<use ns4:href="#md7efaf3aec" x="369.990258" y="
|
| 4633 |
-
<use ns4:href="#md7efaf3aec" x="513.181484" y="
|
| 4634 |
-
<use ns4:href="#md7efaf3aec" x="656.37271" y="
|
| 4635 |
-
<use ns4:href="#md7efaf3aec" x="799.563935" y="
|
| 4636 |
</g>
|
| 4637 |
</g>
|
| 4638 |
<g id="series--torch-mem-eff" class="series">
|
| 4639 |
-
<path d="M 83.607806
|
| 4640 |
<defs>
|
| 4641 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4642 |
</defs>
|
| 4643 |
<g clip-path="url(#p09feef2583)">
|
| 4644 |
-
<use ns4:href="#m9b8c54d372" x="83.607806" y="
|
| 4645 |
-
<use ns4:href="#m9b8c54d372" x="226.799032" y="
|
| 4646 |
-
<use ns4:href="#m9b8c54d372" x="369.990258" y="
|
| 4647 |
-
<use ns4:href="#m9b8c54d372" x="513.181484" y="
|
| 4648 |
-
<use ns4:href="#m9b8c54d372" x="656.37271" y="
|
| 4649 |
<use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4650 |
</g>
|
| 4651 |
</g>
|
| 4652 |
<g id="series--xformers-meff" class="series">
|
| 4653 |
-
<path d="M 83.607806
|
| 4654 |
<defs>
|
| 4655 |
<path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
|
| 4656 |
</defs>
|
| 4657 |
<g clip-path="url(#p09feef2583)">
|
| 4658 |
-
<use ns4:href="#mc655281e0b" x="83.607806" y="
|
| 4659 |
-
<use ns4:href="#mc655281e0b" x="226.799032" y="
|
| 4660 |
-
<use ns4:href="#mc655281e0b" x="369.990258" y="
|
| 4661 |
-
<use ns4:href="#mc655281e0b" x="513.181484" y="
|
| 4662 |
-
<use ns4:href="#mc655281e0b" x="656.37271" y="
|
| 4663 |
-
<use ns4:href="#mc655281e0b" x="799.563935" y="
|
| 4664 |
</g>
|
| 4665 |
</g>
|
| 4666 |
<g id="series--hf-kernels-flash-attn" class="series">
|
| 4667 |
-
<path d="M 83.607806
|
| 4668 |
<defs>
|
| 4669 |
<path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
|
| 4670 |
</defs>
|
| 4671 |
<g clip-path="url(#p09feef2583)">
|
| 4672 |
-
<use ns4:href="#m61c8040d7e" x="83.607806" y="
|
| 4673 |
-
<use ns4:href="#m61c8040d7e" x="226.799032" y="
|
| 4674 |
-
<use ns4:href="#m61c8040d7e" x="369.990258" y="
|
| 4675 |
-
<use ns4:href="#m61c8040d7e" x="513.181484" y="
|
| 4676 |
-
<use ns4:href="#m61c8040d7e" x="656.37271" y="
|
| 4677 |
-
<use ns4:href="#m61c8040d7e" x="799.563935" y="
|
| 4678 |
</g>
|
| 4679 |
</g>
|
| 4680 |
<g id="series--hf-kernels-flash-attn3" class="series">
|
| 4681 |
-
<path d="M 83.607806
|
| 4682 |
<defs>
|
| 4683 |
<path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
|
| 4684 |
</defs>
|
| 4685 |
<g clip-path="url(#p09feef2583)">
|
| 4686 |
-
<use ns4:href="#m7cd35be9cc" x="83.607806" y="
|
| 4687 |
-
<use ns4:href="#m7cd35be9cc" x="226.799032" y="
|
| 4688 |
-
<use ns4:href="#m7cd35be9cc" x="369.990258" y="
|
| 4689 |
-
<use ns4:href="#m7cd35be9cc" x="513.181484" y="
|
| 4690 |
-
<use ns4:href="#m7cd35be9cc" x="656.37271" y="
|
| 4691 |
-
<use ns4:href="#m7cd35be9cc" x="799.563935" y="
|
| 4692 |
</g>
|
| 4693 |
</g>
|
| 4694 |
<g id="patch_3">
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
|
|
|
| 3880 |
<rdf:RDF>
|
| 3881 |
<ns2:Work>
|
| 3882 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3883 |
+
<dc:date>2025-10-29T15:51:09.340715</dc:date>
|
| 3884 |
<dc:format>image/svg+xml</dc:format>
|
| 3885 |
<dc:creator>
|
| 3886 |
<ns2:Agent>
|
|
|
|
| 3990 |
<g id="matplotlib.axis_2">
|
| 3991 |
<g id="ytick_1">
|
| 3992 |
<g id="grid-y--2" class="grid grid-y">
|
| 3993 |
+
<path d="M 47.81 413.024194 L 835.361742 413.024194 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3994 |
</g>
|
| 3995 |
<g id="line2d_7">
|
| 3996 |
<defs>
|
| 3997 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3998 |
</defs>
|
| 3999 |
<g>
|
| 4000 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="413.024194" style="stroke: #000000; stroke-width: 0.8" />
|
| 4001 |
</g>
|
| 4002 |
</g>
|
| 4003 |
<g id="text_7">
|
| 4004 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="416.823413" transform="rotate(-0 40.81 416.823413)">1.0</text>
|
| 4005 |
</g>
|
| 4006 |
</g>
|
| 4007 |
<g id="ytick_2">
|
| 4008 |
<g id="grid-y--3" class="grid grid-y">
|
| 4009 |
+
<path d="M 47.81 351.27252 L 835.361742 351.27252 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4010 |
</g>
|
| 4011 |
<g id="line2d_8">
|
| 4012 |
<g>
|
| 4013 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="351.27252" style="stroke: #000000; stroke-width: 0.8" />
|
| 4014 |
</g>
|
| 4015 |
</g>
|
| 4016 |
<g id="text_8">
|
| 4017 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="355.071739" transform="rotate(-0 40.81 355.071739)">1.2</text>
|
| 4018 |
</g>
|
| 4019 |
</g>
|
| 4020 |
<g id="ytick_3">
|
| 4021 |
<g id="grid-y--4" class="grid grid-y">
|
| 4022 |
+
<path d="M 47.81 289.520846 L 835.361742 289.520846 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4023 |
</g>
|
| 4024 |
<g id="line2d_9">
|
| 4025 |
<g>
|
| 4026 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="289.520846" style="stroke: #000000; stroke-width: 0.8" />
|
| 4027 |
</g>
|
| 4028 |
</g>
|
| 4029 |
<g id="text_9">
|
| 4030 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="293.320065" transform="rotate(-0 40.81 293.320065)">1.4</text>
|
| 4031 |
</g>
|
| 4032 |
</g>
|
| 4033 |
<g id="ytick_4">
|
| 4034 |
<g id="grid-y--5" class="grid grid-y">
|
| 4035 |
+
<path d="M 47.81 227.769172 L 835.361742 227.769172 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4036 |
</g>
|
| 4037 |
<g id="line2d_10">
|
| 4038 |
<g>
|
| 4039 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="227.769172" style="stroke: #000000; stroke-width: 0.8" />
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="text_10">
|
| 4043 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="231.568391" transform="rotate(-0 40.81 231.568391)">1.6</text>
|
| 4044 |
</g>
|
| 4045 |
</g>
|
| 4046 |
<g id="ytick_5">
|
| 4047 |
<g id="grid-y--6" class="grid grid-y">
|
| 4048 |
+
<path d="M 47.81 166.017498 L 835.361742 166.017498 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4049 |
</g>
|
| 4050 |
<g id="line2d_11">
|
| 4051 |
<g>
|
| 4052 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="166.017498" style="stroke: #000000; stroke-width: 0.8" />
|
| 4053 |
</g>
|
| 4054 |
</g>
|
| 4055 |
<g id="text_11">
|
| 4056 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="169.816717" transform="rotate(-0 40.81 169.816717)">1.8</text>
|
| 4057 |
</g>
|
| 4058 |
</g>
|
| 4059 |
<g id="ytick_6">
|
| 4060 |
<g id="grid-y--7" class="grid grid-y">
|
| 4061 |
+
<path d="M 47.81 104.265824 L 835.361742 104.265824 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4062 |
</g>
|
| 4063 |
<g id="line2d_12">
|
| 4064 |
<g>
|
| 4065 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="104.265824" style="stroke: #000000; stroke-width: 0.8" />
|
| 4066 |
</g>
|
| 4067 |
</g>
|
| 4068 |
<g id="text_12">
|
| 4069 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="108.065043" transform="rotate(-0 40.81 108.065043)">2.0</text>
|
| 4070 |
</g>
|
| 4071 |
</g>
|
| 4072 |
<g id="ytick_7">
|
| 4073 |
<g id="grid-y--8" class="grid grid-y">
|
| 4074 |
+
<path d="M 47.81 42.51415 L 835.361742 42.51415 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4075 |
</g>
|
| 4076 |
<g id="line2d_13">
|
| 4077 |
<g>
|
| 4078 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="42.51415" style="stroke: #000000; stroke-width: 0.8" />
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="text_13">
|
| 4082 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="46.313369" transform="rotate(-0 40.81 46.313369)">2.2</text>
|
| 4083 |
</g>
|
| 4084 |
</g>
|
| 4085 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4087 |
</g>
|
| 4088 |
</g>
|
| 4089 |
<g id="series--torch-flash-ma" class="series">
|
| 4090 |
+
<path d="M 83.607806 346.756003 L 226.799032 329.780159 L 369.990258 321.569965 L 513.181484 313.597515 L 656.37271 266.140736 L 799.563935 260.34812 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4091 |
<defs>
|
| 4092 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4093 |
</defs>
|
| 4094 |
<g clip-path="url(#p09feef2583)">
|
| 4095 |
+
<use ns4:href="#md7efaf3aec" x="83.607806" y="346.756003" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4096 |
+
<use ns4:href="#md7efaf3aec" x="226.799032" y="329.780159" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4097 |
+
<use ns4:href="#md7efaf3aec" x="369.990258" y="321.569965" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4098 |
+
<use ns4:href="#md7efaf3aec" x="513.181484" y="313.597515" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4099 |
+
<use ns4:href="#md7efaf3aec" x="656.37271" y="266.140736" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4100 |
+
<use ns4:href="#md7efaf3aec" x="799.563935" y="260.34812" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4101 |
</g>
|
| 4102 |
</g>
|
| 4103 |
<g id="series--torch-mem-eff" class="series">
|
| 4104 |
+
<path d="M 83.607806 155.401459 L 226.799032 122.036412 L 369.990258 119.6 L 513.181484 89.078617 L 656.37271 83.422164 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4105 |
<defs>
|
| 4106 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4107 |
</defs>
|
| 4108 |
<g clip-path="url(#p09feef2583)">
|
| 4109 |
+
<use ns4:href="#m9b8c54d372" x="83.607806" y="155.401459" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4110 |
+
<use ns4:href="#m9b8c54d372" x="226.799032" y="122.036412" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4111 |
+
<use ns4:href="#m9b8c54d372" x="369.990258" y="119.6" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4112 |
+
<use ns4:href="#m9b8c54d372" x="513.181484" y="89.078617" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4113 |
+
<use ns4:href="#m9b8c54d372" x="656.37271" y="83.422164" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4114 |
<use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4115 |
</g>
|
| 4116 |
</g>
|
| 4117 |
<g id="series--xformers-meff" class="series">
|
| 4118 |
+
<path d="M 83.607806 415.619926 L 226.799032 397.353472 L 369.990258 394.772252 L 513.181484 393.111132 L 656.37271 341.729417 L 799.563935 342.902698 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
|
| 4119 |
<defs>
|
| 4120 |
<path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
|
| 4121 |
</defs>
|
| 4122 |
<g clip-path="url(#p09feef2583)">
|
| 4123 |
+
<use ns4:href="#mc655281e0b" x="83.607806" y="415.619926" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4124 |
+
<use ns4:href="#mc655281e0b" x="226.799032" y="397.353472" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4125 |
+
<use ns4:href="#mc655281e0b" x="369.990258" y="394.772252" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4126 |
+
<use ns4:href="#mc655281e0b" x="513.181484" y="393.111132" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4127 |
+
<use ns4:href="#mc655281e0b" x="656.37271" y="341.729417" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4128 |
+
<use ns4:href="#mc655281e0b" x="799.563935" y="342.902698" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4129 |
</g>
|
| 4130 |
</g>
|
| 4131 |
<g id="series--hf-kernels-flash-attn" class="series">
|
| 4132 |
+
<path d="M 83.607806 428.387702 L 226.799032 413.415083 L 369.990258 398.063616 L 513.181484 390.915551 L 656.37271 347.629789 L 799.563935 352.847806 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
|
| 4133 |
<defs>
|
| 4134 |
<path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
|
| 4135 |
</defs>
|
| 4136 |
<g clip-path="url(#p09feef2583)">
|
| 4137 |
+
<use ns4:href="#m61c8040d7e" x="83.607806" y="428.387702" style="fill: #d62728; stroke: #d62728" />
|
| 4138 |
+
<use ns4:href="#m61c8040d7e" x="226.799032" y="413.415083" style="fill: #d62728; stroke: #d62728" />
|
| 4139 |
+
<use ns4:href="#m61c8040d7e" x="369.990258" y="398.063616" style="fill: #d62728; stroke: #d62728" />
|
| 4140 |
+
<use ns4:href="#m61c8040d7e" x="513.181484" y="390.915551" style="fill: #d62728; stroke: #d62728" />
|
| 4141 |
+
<use ns4:href="#m61c8040d7e" x="656.37271" y="347.629789" style="fill: #d62728; stroke: #d62728" />
|
| 4142 |
+
<use ns4:href="#m61c8040d7e" x="799.563935" y="352.847806" style="fill: #d62728; stroke: #d62728" />
|
| 4143 |
</g>
|
| 4144 |
</g>
|
| 4145 |
<g id="series--hf-kernels-flash-attn3" class="series">
|
| 4146 |
+
<path d="M 83.607806 411.846899 L 226.799032 414.604111 L 369.990258 404.183516 L 513.181484 406.09473 L 656.37271 355.213203 L 799.563935 367.844508 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
|
| 4147 |
<defs>
|
| 4148 |
<path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
|
| 4149 |
</defs>
|
| 4150 |
<g clip-path="url(#p09feef2583)">
|
| 4151 |
+
<use ns4:href="#m7cd35be9cc" x="83.607806" y="411.846899" style="fill: #9467bd; stroke: #9467bd" />
|
| 4152 |
+
<use ns4:href="#m7cd35be9cc" x="226.799032" y="414.604111" style="fill: #9467bd; stroke: #9467bd" />
|
| 4153 |
+
<use ns4:href="#m7cd35be9cc" x="369.990258" y="404.183516" style="fill: #9467bd; stroke: #9467bd" />
|
| 4154 |
+
<use ns4:href="#m7cd35be9cc" x="513.181484" y="406.09473" style="fill: #9467bd; stroke: #9467bd" />
|
| 4155 |
+
<use ns4:href="#m7cd35be9cc" x="656.37271" y="355.213203" style="fill: #9467bd; stroke: #9467bd" />
|
| 4156 |
+
<use ns4:href="#m7cd35be9cc" x="799.563935" y="367.844508" style="fill: #9467bd; stroke: #9467bd" />
|
| 4157 |
</g>
|
| 4158 |
</g>
|
| 4159 |
<g id="patch_3">
|
|
|
|
| 4238 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4239 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4240 |
</span> |
|
| 4241 |
+
Cell: combine | 4.24s
|
| 4242 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4243 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4244 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4345 |
COMBINED BENCHMARK SUMMARY
|
| 4346 |
|
| 4347 |
impl wl p50(ms) ok
|
| 4348 |
+
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
|
| 4349 |
+
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
|
| 4350 |
+
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
|
| 4351 |
+
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.07 True
|
| 4352 |
+
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
|
| 4353 |
+
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.19 True
|
| 4354 |
+
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 1.00 True
|
| 4355 |
+
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.99 True
|
| 4356 |
+
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True
|
| 4357 |
+
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
|
| 4358 |
+
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.19 True
|
| 4359 |
+
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.15 True
|
| 4360 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 4361 |
+
Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
|
| 4362 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 4363 |
+
Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
|
| 4364 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 4365 |
+
Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
|
| 4366 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 4367 |
+
Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
|
| 4368 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 4369 |
+
Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
|
| 4370 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 4371 |
+
Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
|
| 4372 |
+
torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
|
| 4373 |
+
torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
|
| 4374 |
+
torch_flash_ma cuda_attn_L320_bfloat16 1.30 True
|
| 4375 |
+
torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
|
| 4376 |
+
torch_flash_ma cuda_attn_L448_bfloat16 1.48 True
|
| 4377 |
+
torch_flash_ma cuda_attn_L512_bfloat16 1.49 True
|
| 4378 |
torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
|
| 4379 |
+
torch_mem_eff cuda_attn_L256_bfloat16 1.94 True
|
| 4380 |
+
torch_mem_eff cuda_attn_L320_bfloat16 1.95 True
|
| 4381 |
+
torch_mem_eff cuda_attn_L384_bfloat16 2.05 True
|
| 4382 |
+
torch_mem_eff cuda_attn_L448_bfloat16 2.07 True
|
| 4383 |
torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
|
| 4384 |
+
xformers_meff cuda_attn_L128_bfloat16 0.99 True
|
| 4385 |
+
xformers_meff cuda_attn_L256_bfloat16 1.05 True
|
| 4386 |
+
xformers_meff cuda_attn_L320_bfloat16 1.06 True
|
| 4387 |
+
xformers_meff cuda_attn_L384_bfloat16 1.06 True
|
| 4388 |
+
xformers_meff cuda_attn_L448_bfloat16 1.23 True
|
| 4389 |
+
xformers_meff cuda_attn_L512_bfloat16 1.23 True
|
| 4390 |
|
| 4391 |
GENERATING COMBINED VISUALIZATION
|
| 4392 |
|
|
|
|
| 4410 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4411 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4412 |
<div class="uv-logs-content" style="display: none;">
|
| 4413 |
+
Installed 37 packages in 204ms
|
| 4414 |
</div>
|
| 4415 |
</div>
|
| 4416 |
<div class="cell-artifacts">
|
|
|
|
| 4423 |
<rdf:RDF>
|
| 4424 |
<ns2:Work>
|
| 4425 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4426 |
+
<dc:date>2025-10-29T15:51:09.340715</dc:date>
|
| 4427 |
<dc:format>image/svg+xml</dc:format>
|
| 4428 |
<dc:creator>
|
| 4429 |
<ns2:Agent>
|
|
|
|
| 4533 |
<g id="matplotlib.axis_2">
|
| 4534 |
<g id="ytick_1">
|
| 4535 |
<g id="grid-y--2" class="grid grid-y">
|
| 4536 |
+
<path d="M 47.81 413.024194 L 835.361742 413.024194 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4537 |
</g>
|
| 4538 |
<g id="line2d_7">
|
| 4539 |
<defs>
|
| 4540 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4541 |
</defs>
|
| 4542 |
<g>
|
| 4543 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="413.024194" style="stroke: #000000; stroke-width: 0.8" />
|
| 4544 |
</g>
|
| 4545 |
</g>
|
| 4546 |
<g id="text_7">
|
| 4547 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="416.823413" transform="rotate(-0 40.81 416.823413)">1.0</text>
|
| 4548 |
</g>
|
| 4549 |
</g>
|
| 4550 |
<g id="ytick_2">
|
| 4551 |
<g id="grid-y--3" class="grid grid-y">
|
| 4552 |
+
<path d="M 47.81 351.27252 L 835.361742 351.27252 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4553 |
</g>
|
| 4554 |
<g id="line2d_8">
|
| 4555 |
<g>
|
| 4556 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="351.27252" style="stroke: #000000; stroke-width: 0.8" />
|
| 4557 |
</g>
|
| 4558 |
</g>
|
| 4559 |
<g id="text_8">
|
| 4560 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="355.071739" transform="rotate(-0 40.81 355.071739)">1.2</text>
|
| 4561 |
</g>
|
| 4562 |
</g>
|
| 4563 |
<g id="ytick_3">
|
| 4564 |
<g id="grid-y--4" class="grid grid-y">
|
| 4565 |
+
<path d="M 47.81 289.520846 L 835.361742 289.520846 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4566 |
</g>
|
| 4567 |
<g id="line2d_9">
|
| 4568 |
<g>
|
| 4569 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="289.520846" style="stroke: #000000; stroke-width: 0.8" />
|
| 4570 |
</g>
|
| 4571 |
</g>
|
| 4572 |
<g id="text_9">
|
| 4573 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="293.320065" transform="rotate(-0 40.81 293.320065)">1.4</text>
|
| 4574 |
</g>
|
| 4575 |
</g>
|
| 4576 |
<g id="ytick_4">
|
| 4577 |
<g id="grid-y--5" class="grid grid-y">
|
| 4578 |
+
<path d="M 47.81 227.769172 L 835.361742 227.769172 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4579 |
</g>
|
| 4580 |
<g id="line2d_10">
|
| 4581 |
<g>
|
| 4582 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="227.769172" style="stroke: #000000; stroke-width: 0.8" />
|
| 4583 |
</g>
|
| 4584 |
</g>
|
| 4585 |
<g id="text_10">
|
| 4586 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="231.568391" transform="rotate(-0 40.81 231.568391)">1.6</text>
|
| 4587 |
</g>
|
| 4588 |
</g>
|
| 4589 |
<g id="ytick_5">
|
| 4590 |
<g id="grid-y--6" class="grid grid-y">
|
| 4591 |
+
<path d="M 47.81 166.017498 L 835.361742 166.017498 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4592 |
</g>
|
| 4593 |
<g id="line2d_11">
|
| 4594 |
<g>
|
| 4595 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="166.017498" style="stroke: #000000; stroke-width: 0.8" />
|
| 4596 |
</g>
|
| 4597 |
</g>
|
| 4598 |
<g id="text_11">
|
| 4599 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="169.816717" transform="rotate(-0 40.81 169.816717)">1.8</text>
|
| 4600 |
</g>
|
| 4601 |
</g>
|
| 4602 |
<g id="ytick_6">
|
| 4603 |
<g id="grid-y--7" class="grid grid-y">
|
| 4604 |
+
<path d="M 47.81 104.265824 L 835.361742 104.265824 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4605 |
</g>
|
| 4606 |
<g id="line2d_12">
|
| 4607 |
<g>
|
| 4608 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="104.265824" style="stroke: #000000; stroke-width: 0.8" />
|
| 4609 |
</g>
|
| 4610 |
</g>
|
| 4611 |
<g id="text_12">
|
| 4612 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="108.065043" transform="rotate(-0 40.81 108.065043)">2.0</text>
|
| 4613 |
</g>
|
| 4614 |
</g>
|
| 4615 |
<g id="ytick_7">
|
| 4616 |
<g id="grid-y--8" class="grid grid-y">
|
| 4617 |
+
<path d="M 47.81 42.51415 L 835.361742 42.51415 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4618 |
</g>
|
| 4619 |
<g id="line2d_13">
|
| 4620 |
<g>
|
| 4621 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="42.51415" style="stroke: #000000; stroke-width: 0.8" />
|
| 4622 |
</g>
|
| 4623 |
</g>
|
| 4624 |
<g id="text_13">
|
| 4625 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="46.313369" transform="rotate(-0 40.81 46.313369)">2.2</text>
|
| 4626 |
</g>
|
| 4627 |
</g>
|
| 4628 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4630 |
</g>
|
| 4631 |
</g>
|
| 4632 |
<g id="series--torch-flash-ma" class="series">
|
| 4633 |
+
<path d="M 83.607806 346.756003 L 226.799032 329.780159 L 369.990258 321.569965 L 513.181484 313.597515 L 656.37271 266.140736 L 799.563935 260.34812 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4634 |
<defs>
|
| 4635 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4636 |
</defs>
|
| 4637 |
<g clip-path="url(#p09feef2583)">
|
| 4638 |
+
<use ns4:href="#md7efaf3aec" x="83.607806" y="346.756003" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4639 |
+
<use ns4:href="#md7efaf3aec" x="226.799032" y="329.780159" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4640 |
+
<use ns4:href="#md7efaf3aec" x="369.990258" y="321.569965" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4641 |
+
<use ns4:href="#md7efaf3aec" x="513.181484" y="313.597515" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4642 |
+
<use ns4:href="#md7efaf3aec" x="656.37271" y="266.140736" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4643 |
+
<use ns4:href="#md7efaf3aec" x="799.563935" y="260.34812" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4644 |
</g>
|
| 4645 |
</g>
|
| 4646 |
<g id="series--torch-mem-eff" class="series">
|
| 4647 |
+
<path d="M 83.607806 155.401459 L 226.799032 122.036412 L 369.990258 119.6 L 513.181484 89.078617 L 656.37271 83.422164 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4648 |
<defs>
|
| 4649 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4650 |
</defs>
|
| 4651 |
<g clip-path="url(#p09feef2583)">
|
| 4652 |
+
<use ns4:href="#m9b8c54d372" x="83.607806" y="155.401459" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4653 |
+
<use ns4:href="#m9b8c54d372" x="226.799032" y="122.036412" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4654 |
+
<use ns4:href="#m9b8c54d372" x="369.990258" y="119.6" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4655 |
+
<use ns4:href="#m9b8c54d372" x="513.181484" y="89.078617" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4656 |
+
<use ns4:href="#m9b8c54d372" x="656.37271" y="83.422164" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4657 |
<use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4658 |
</g>
|
| 4659 |
</g>
|
| 4660 |
<g id="series--xformers-meff" class="series">
|
| 4661 |
+
<path d="M 83.607806 415.619926 L 226.799032 397.353472 L 369.990258 394.772252 L 513.181484 393.111132 L 656.37271 341.729417 L 799.563935 342.902698 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
|
| 4662 |
<defs>
|
| 4663 |
<path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
|
| 4664 |
</defs>
|
| 4665 |
<g clip-path="url(#p09feef2583)">
|
| 4666 |
+
<use ns4:href="#mc655281e0b" x="83.607806" y="415.619926" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4667 |
+
<use ns4:href="#mc655281e0b" x="226.799032" y="397.353472" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4668 |
+
<use ns4:href="#mc655281e0b" x="369.990258" y="394.772252" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4669 |
+
<use ns4:href="#mc655281e0b" x="513.181484" y="393.111132" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4670 |
+
<use ns4:href="#mc655281e0b" x="656.37271" y="341.729417" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4671 |
+
<use ns4:href="#mc655281e0b" x="799.563935" y="342.902698" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4672 |
</g>
|
| 4673 |
</g>
|
| 4674 |
<g id="series--hf-kernels-flash-attn" class="series">
|
| 4675 |
+
<path d="M 83.607806 428.387702 L 226.799032 413.415083 L 369.990258 398.063616 L 513.181484 390.915551 L 656.37271 347.629789 L 799.563935 352.847806 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
|
| 4676 |
<defs>
|
| 4677 |
<path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
|
| 4678 |
</defs>
|
| 4679 |
<g clip-path="url(#p09feef2583)">
|
| 4680 |
+
<use ns4:href="#m61c8040d7e" x="83.607806" y="428.387702" style="fill: #d62728; stroke: #d62728" />
|
| 4681 |
+
<use ns4:href="#m61c8040d7e" x="226.799032" y="413.415083" style="fill: #d62728; stroke: #d62728" />
|
| 4682 |
+
<use ns4:href="#m61c8040d7e" x="369.990258" y="398.063616" style="fill: #d62728; stroke: #d62728" />
|
| 4683 |
+
<use ns4:href="#m61c8040d7e" x="513.181484" y="390.915551" style="fill: #d62728; stroke: #d62728" />
|
| 4684 |
+
<use ns4:href="#m61c8040d7e" x="656.37271" y="347.629789" style="fill: #d62728; stroke: #d62728" />
|
| 4685 |
+
<use ns4:href="#m61c8040d7e" x="799.563935" y="352.847806" style="fill: #d62728; stroke: #d62728" />
|
| 4686 |
</g>
|
| 4687 |
</g>
|
| 4688 |
<g id="series--hf-kernels-flash-attn3" class="series">
|
| 4689 |
+
<path d="M 83.607806 411.846899 L 226.799032 414.604111 L 369.990258 404.183516 L 513.181484 406.09473 L 656.37271 355.213203 L 799.563935 367.844508 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
|
| 4690 |
<defs>
|
| 4691 |
<path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
|
| 4692 |
</defs>
|
| 4693 |
<g clip-path="url(#p09feef2583)">
|
| 4694 |
+
<use ns4:href="#m7cd35be9cc" x="83.607806" y="411.846899" style="fill: #9467bd; stroke: #9467bd" />
|
| 4695 |
+
<use ns4:href="#m7cd35be9cc" x="226.799032" y="414.604111" style="fill: #9467bd; stroke: #9467bd" />
|
| 4696 |
+
<use ns4:href="#m7cd35be9cc" x="369.990258" y="404.183516" style="fill: #9467bd; stroke: #9467bd" />
|
| 4697 |
+
<use ns4:href="#m7cd35be9cc" x="513.181484" y="406.09473" style="fill: #9467bd; stroke: #9467bd" />
|
| 4698 |
+
<use ns4:href="#m7cd35be9cc" x="656.37271" y="355.213203" style="fill: #9467bd; stroke: #9467bd" />
|
| 4699 |
+
<use ns4:href="#m7cd35be9cc" x="799.563935" y="367.844508" style="fill: #9467bd; stroke: #9467bd" />
|
| 4700 |
</g>
|
| 4701 |
</g>
|
| 4702 |
<g id="patch_3">
|
index.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
@@ -3865,8 +3873,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3865 |
<h1>All Benchmarks Aggregated Report</h1>
|
| 3866 |
<h2><a href="layer_norm/">Layer Norm</a></h2>
|
| 3867 |
<div class="artifact-preview">
|
| 3868 |
-
<
|
| 3869 |
-
</object>
|
| 3870 |
</div>
|
| 3871 |
|
| 3872 |
<table>
|
|
@@ -3889,8 +3896,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3889 |
</table>
|
| 3890 |
<h2><a href="rotary/">Rotary Position Embeddings</a></h2>
|
| 3891 |
<div class="artifact-preview">
|
| 3892 |
-
<
|
| 3893 |
-
</object>
|
| 3894 |
</div>
|
| 3895 |
|
| 3896 |
<table>
|
|
@@ -3913,8 +3919,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3913 |
</table>
|
| 3914 |
<h2><a href="flash_attn/">Flash Attention</a></h2>
|
| 3915 |
<div class="artifact-preview">
|
| 3916 |
-
<
|
| 3917 |
-
</object>
|
| 3918 |
</div>
|
| 3919 |
|
| 3920 |
<table>
|
|
@@ -3953,8 +3958,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3953 |
</table>
|
| 3954 |
<h2><a href="causal_conv1d/">Causal Conv1D</a></h2>
|
| 3955 |
<div class="artifact-preview">
|
| 3956 |
-
<
|
| 3957 |
-
</object>
|
| 3958 |
</div>
|
| 3959 |
|
| 3960 |
<table>
|
|
@@ -3977,8 +3981,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3977 |
</table>
|
| 3978 |
<h2><a href="activation/">Activation</a></h2>
|
| 3979 |
<div class="artifact-preview">
|
| 3980 |
-
<
|
| 3981 |
-
</object>
|
| 3982 |
</div>
|
| 3983 |
|
| 3984 |
<table>
|
|
@@ -4001,8 +4004,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4001 |
</table>
|
| 4002 |
<h2><a href="relu/">ReLU</a></h2>
|
| 4003 |
<div class="artifact-preview">
|
| 4004 |
-
<
|
| 4005 |
-
</object>
|
| 4006 |
</div>
|
| 4007 |
|
| 4008 |
<table>
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
|
|
|
| 3873 |
<h1>All Benchmarks Aggregated Report</h1>
|
| 3874 |
<h2><a href="layer_norm/">Layer Norm</a></h2>
|
| 3875 |
<div class="artifact-preview">
|
| 3876 |
+
<img src="layer_norm/results/artifacts/combine/latency.svg" alt="Layer Norm Latency" width="800">
|
|
|
|
| 3877 |
</div>
|
| 3878 |
|
| 3879 |
<table>
|
|
|
|
| 3896 |
</table>
|
| 3897 |
<h2><a href="rotary/">Rotary Position Embeddings</a></h2>
|
| 3898 |
<div class="artifact-preview">
|
| 3899 |
+
<img src="rotary/results/artifacts/combine/latency.svg" alt="Rotary Position Embeddings Latency" width="800">
|
|
|
|
| 3900 |
</div>
|
| 3901 |
|
| 3902 |
<table>
|
|
|
|
| 3919 |
</table>
|
| 3920 |
<h2><a href="flash_attn/">Flash Attention</a></h2>
|
| 3921 |
<div class="artifact-preview">
|
| 3922 |
+
<img src="flash_attn/results/artifacts/combine/latency.svg" alt="Flash Attention Latency" width="800">
|
|
|
|
| 3923 |
</div>
|
| 3924 |
|
| 3925 |
<table>
|
|
|
|
| 3958 |
</table>
|
| 3959 |
<h2><a href="causal_conv1d/">Causal Conv1D</a></h2>
|
| 3960 |
<div class="artifact-preview">
|
| 3961 |
+
<img src="causal_conv1d/results/artifacts/combine/latency.svg" alt="Causal Conv1D Latency" width="800">
|
|
|
|
| 3962 |
</div>
|
| 3963 |
|
| 3964 |
<table>
|
|
|
|
| 3981 |
</table>
|
| 3982 |
<h2><a href="activation/">Activation</a></h2>
|
| 3983 |
<div class="artifact-preview">
|
| 3984 |
+
<img src="activation/results/artifacts/combine/latency.svg" alt="Activation Latency" width="800">
|
|
|
|
| 3985 |
</div>
|
| 3986 |
|
| 3987 |
<table>
|
|
|
|
| 4004 |
</table>
|
| 4005 |
<h2><a href="relu/">ReLU</a></h2>
|
| 4006 |
<div class="artifact-preview">
|
| 4007 |
+
<img src="relu/results/artifacts/combine/latency.svg" alt="ReLU Latency" width="800">
|
|
|
|
| 4008 |
</div>
|
| 4009 |
|
| 4010 |
<table>
|
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
{"ts": "2025-10-
|
| 2 |
-
{"ts": "2025-10-
|
| 3 |
-
{"ts": "2025-10-
|
| 4 |
-
{"ts": "2025-10-
|
|
|
|
| 1 |
+
{"ts": "2025-10-29T15:50:43Z", "run": "146df378aaa14293b5d1526656fc6cb4", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8137589999819284, "p50": 0.8219090000238793, "p90": 0.8223789999988185, "mean": 0.8196492000138278, "iqr": 0.007259999961206631, "raw_times": [0.825080000026901, 0.8219090000238793, 0.8137589999819284, 0.8223789999988185, 0.8151190000376118], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8213489999775447, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-29T15:50:43Z", "run": "146df378aaa14293b5d1526656fc6cb4", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.680888999999297, "p50": 1.6820789999769659, "p90": 1.6842590000010205, "mean": 1.683131400000093, "iqr": 0.0026189999857706425, "raw_times": [1.6820789999769659, 1.680888999999297, 1.6816400000152498, 1.6842590000010205, 1.6867900000079317], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.687689999982922, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-29T15:50:43Z", "run": "146df378aaa14293b5d1526656fc6cb4", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.603787000021839, "p50": 1.6093779999550861, "p90": 1.6102179999961663, "mean": 1.6086159999986194, "iqr": 0.002069999993636884, "raw_times": [1.6093779999550861, 1.6081480000025294, 1.603787000021839, 1.611549000017476, 1.6102179999961663], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6238279999924998, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-29T15:50:44Z", "run": "146df378aaa14293b5d1526656fc6cb4", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.30805800001599, "p50": 3.3301390000133324, "p90": 3.331328999991001, "mean": 3.3278527999868857, "iqr": 0.001610000026630587, "raw_times": [3.331328999991001, 3.3400189999497343, 3.3297189999643706, 3.30805800001599, 3.3301390000133324], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.3235790000389898, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
|
layer_norm/impls/cells/benchmark.py
CHANGED
|
@@ -3,7 +3,6 @@
|
|
| 3 |
# dependencies = [
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
-
# "kernels",
|
| 7 |
# "kernels-benchmark-tools",
|
| 8 |
# ]
|
| 9 |
#
|
|
@@ -13,37 +12,15 @@
|
|
| 13 |
import torch
|
| 14 |
import sys
|
| 15 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
-
from kernels import get_kernel
|
| 17 |
|
| 18 |
-
# Load the layer norm kernel
|
| 19 |
-
layer_norm_kernel = get_kernel("kernels-community/layer-norm")
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
B, S, D = x.shape
|
| 24 |
-
# The kernel expects [N, D] input; support beta (bias) if provided.
|
| 25 |
-
out = layer_norm_kernel.dropout_add_ln_fwd(
|
| 26 |
-
input=x.view(-1, D),
|
| 27 |
-
gamma=weight,
|
| 28 |
-
beta=bias,
|
| 29 |
-
rowscale=None,
|
| 30 |
-
colscale=None,
|
| 31 |
-
x0_subset=None,
|
| 32 |
-
z_subset=None,
|
| 33 |
-
dropout_p=0.0,
|
| 34 |
-
epsilon=eps,
|
| 35 |
-
rowscale_const=1.0,
|
| 36 |
-
z_numrows=S,
|
| 37 |
-
gen=None,
|
| 38 |
-
residual_in_fp32=False,
|
| 39 |
-
is_rms_norm=False,
|
| 40 |
-
)[0].view(B, S, D)
|
| 41 |
-
return out
|
| 42 |
|
| 43 |
|
| 44 |
run_benchmark(
|
| 45 |
kernel_type=KernelTypeEnum.LAYER_NORM,
|
| 46 |
-
impl_name="
|
| 47 |
-
impl_tags={"family": "
|
| 48 |
-
impl_func=
|
| 49 |
)
|
|
|
|
| 3 |
# dependencies = [
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
|
|
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
# ]
|
| 8 |
#
|
|
|
|
| 12 |
import torch
|
| 13 |
import sys
|
| 14 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
|
|
|
| 15 |
|
|
|
|
|
|
|
| 16 |
|
| 17 |
+
def torch_layer_norm(x, weight, bias, eps: float = 1e-5):
|
| 18 |
+
return torch.nn.functional.layer_norm(x, (x.shape[-1],), weight, bias, eps)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
run_benchmark(
|
| 22 |
kernel_type=KernelTypeEnum.LAYER_NORM,
|
| 23 |
+
impl_name="torch_layer_norm",
|
| 24 |
+
impl_tags={"family": "torch", "op": "layer_norm"},
|
| 25 |
+
impl_func=torch_layer_norm,
|
| 26 |
)
|
layer_norm/impls/hf_kernels_layer_norm.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
@@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3872 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3873 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3874 |
</span> |
|
| 3875 |
-
Cell: benchmark |
|
| 3876 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3877 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3878 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3943,19 +3951,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
|
|
| 3943 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3944 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3945 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3946 |
-
hf_kernels_layer_norm
|
| 3947 |
-
_layer_norm_f8ec252::dropout_add_ln_fwd 1.
|
| 3948 |
-
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3949 |
-
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3950 |
-
Activity Buffer Request 36.
|
| 3951 |
-
aten::view 0.
|
| 3952 |
-
aten::empty 1.
|
| 3953 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.
|
| 3954 |
-
cudaLaunchKernel 1.
|
| 3955 |
-
cudaDeviceSynchronize 53.
|
| 3956 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3957 |
-
Self CPU time total:
|
| 3958 |
-
Self CUDA time total: 2.
|
| 3959 |
|
| 3960 |
|
| 3961 |
|
|
@@ -3965,19 +3973,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
|
|
| 3965 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3966 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3967 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3968 |
-
hf_kernels_layer_norm
|
| 3969 |
-
_layer_norm_f8ec252::dropout_add_ln_fwd 0.
|
| 3970 |
-
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3971 |
-
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3972 |
-
Activity Buffer Request
|
| 3973 |
-
aten::view 0.
|
| 3974 |
-
aten::empty 0.46% 29.
|
| 3975 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08%
|
| 3976 |
-
cudaLaunchKernel 0.
|
| 3977 |
-
cudaDeviceSynchronize
|
| 3978 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3979 |
-
Self CPU time total: 6.
|
| 3980 |
-
Self CUDA time total: 4.
|
| 3981 |
|
| 3982 |
|
| 3983 |
|
|
@@ -3987,19 +3995,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
|
|
| 3987 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3988 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3989 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3990 |
-
hf_kernels_layer_norm 1.
|
| 3991 |
-
_layer_norm_f8ec252::dropout_add_ln_fwd 0.
|
| 3992 |
-
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3993 |
-
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3994 |
-
Activity Buffer Request 23.
|
| 3995 |
-
aten::view 0.
|
| 3996 |
-
aten::empty 0.
|
| 3997 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08%
|
| 3998 |
-
cudaLaunchKernel 0.
|
| 3999 |
-
cudaDeviceSynchronize
|
| 4000 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4001 |
-
Self CPU time total: 6.
|
| 4002 |
-
Self CUDA time total: 4.
|
| 4003 |
|
| 4004 |
|
| 4005 |
|
|
@@ -4009,24 +4017,24 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
|
|
| 4009 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4010 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4011 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4012 |
-
hf_kernels_layer_norm 1.
|
| 4013 |
-
_layer_norm_f8ec252::dropout_add_ln_fwd 0.
|
| 4014 |
-
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4015 |
-
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4016 |
-
Activity Buffer Request
|
| 4017 |
-
aten::view 0.
|
| 4018 |
-
aten::empty 0.
|
| 4019 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.
|
| 4020 |
-
cudaLaunchKernel 2.
|
| 4021 |
-
cudaDeviceSynchronize
|
| 4022 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4023 |
-
Self CPU time total:
|
| 4024 |
-
Self CUDA time total: 9.
|
| 4025 |
|
| 4026 |
|
| 4027 |
impl wl p50(ms) ok
|
| 4028 |
-
hf_kernels_layer_norm LN_B16_S2048_D4096 0.
|
| 4029 |
-
hf_kernels_layer_norm LN_B16_S2048_D8192 1.
|
| 4030 |
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
|
| 4031 |
hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
|
| 4032 |
</pre></div>
|
|
@@ -4035,12 +4043,12 @@ hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
|
|
| 4035 |
<div class="uv-logs-content" style="display: none;">
|
| 4036 |
Downloading hf-xet (3.2MiB)
|
| 4037 |
Downloading hf-xet
|
| 4038 |
-
Installed
|
| 4039 |
</div>
|
| 4040 |
</div>
|
| 4041 |
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4042 |
-
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.
|
| 4043 |
-
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.
|
| 4044 |
<div class="cell-artifacts">
|
| 4045 |
<h4>Artifacts:</h4>
|
| 4046 |
<a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
|
|
|
| 3880 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3881 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3882 |
</span> |
|
| 3883 |
+
Cell: benchmark | 9.83s
|
| 3884 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3885 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3886 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3951 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3952 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3953 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3954 |
+
hf_kernels_layer_norm 4.95% 198.743us 46.81% 1.878ms 1.878ms 0.000us 0.00% 3.111ms 3.111ms 1
|
| 3955 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 1.73% 69.535us 41.21% 1.653ms 550.933us 2.375ms 100.00% 3.111ms 1.037ms 3
|
| 3956 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.376ms 100.07% 2.376ms 2.376ms 1
|
| 3957 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.375ms 100.00% 2.375ms 791.590us 3
|
| 3958 |
+
Activity Buffer Request 36.98% 1.483ms 36.98% 1.483ms 1.483ms 736.636us 31.02% 736.636us 736.636us 1
|
| 3959 |
+
aten::view 0.65% 26.132us 0.65% 26.132us 4.355us 0.000us 0.00% 0.000us 0.000us 6
|
| 3960 |
+
aten::empty 1.22% 49.009us 1.22% 49.009us 5.445us 0.000us 0.00% 0.000us 0.000us 9
|
| 3961 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.22% 8.769us 0.22% 8.769us 2.923us 0.000us 0.00% 0.000us 0.000us 3
|
| 3962 |
+
cudaLaunchKernel 1.05% 42.291us 1.05% 42.291us 14.097us 0.000us 0.00% 0.000us 0.000us 3
|
| 3963 |
+
cudaDeviceSynchronize 53.19% 2.133ms 53.19% 2.133ms 2.133ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3964 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3965 |
+
Self CPU time total: 4.011ms
|
| 3966 |
+
Self CUDA time total: 2.375ms
|
| 3967 |
|
| 3968 |
|
| 3969 |
|
|
|
|
| 3973 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3974 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3975 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3976 |
+
hf_kernels_layer_norm 1.97% 125.105us 26.88% 1.705ms 1.705ms 0.000us 0.00% 6.375ms 6.375ms 1
|
| 3977 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 0.73% 46.170us 24.73% 1.568ms 522.755us 4.809ms 100.00% 6.375ms 2.125ms 3
|
| 3978 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.811ms 100.03% 4.811ms 4.811ms 1
|
| 3979 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.809ms 100.00% 4.809ms 1.603ms 3
|
| 3980 |
+
Activity Buffer Request 22.98% 1.457ms 22.98% 1.457ms 1.457ms 1.565ms 32.55% 1.565ms 1.565ms 1
|
| 3981 |
+
aten::view 0.18% 11.529us 0.18% 11.529us 1.922us 0.000us 0.00% 0.000us 0.000us 6
|
| 3982 |
+
aten::empty 0.46% 29.430us 0.46% 29.430us 3.270us 0.000us 0.00% 0.000us 0.000us 9
|
| 3983 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.900us 0.08% 4.900us 1.633us 0.000us 0.00% 0.000us 0.000us 3
|
| 3984 |
+
cudaLaunchKernel 0.48% 30.441us 0.48% 30.441us 10.147us 0.000us 0.00% 0.000us 0.000us 3
|
| 3985 |
+
cudaDeviceSynchronize 73.12% 4.638ms 73.12% 4.638ms 4.638ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
+
Self CPU time total: 6.343ms
|
| 3988 |
+
Self CUDA time total: 4.809ms
|
| 3989 |
|
| 3990 |
|
| 3991 |
|
|
|
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3997 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3998 |
+
hf_kernels_layer_norm 1.75% 110.793us 26.94% 1.702ms 1.702ms 0.000us 0.00% 6.331ms 6.331ms 1
|
| 3999 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 0.70% 44.248us 25.01% 1.580ms 526.532us 4.779ms 100.00% 6.331ms 2.110ms 3
|
| 4000 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.781ms 100.03% 4.781ms 4.781ms 1
|
| 4001 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.779ms 100.00% 4.779ms 1.593ms 3
|
| 4002 |
+
Activity Buffer Request 23.30% 1.472ms 23.30% 1.472ms 1.472ms 1.552ms 32.48% 1.552ms 1.552ms 1
|
| 4003 |
+
aten::view 0.18% 11.190us 0.18% 11.190us 1.865us 0.000us 0.00% 0.000us 0.000us 6
|
| 4004 |
+
aten::empty 0.49% 30.823us 0.49% 30.823us 3.425us 0.000us 0.00% 0.000us 0.000us 9
|
| 4005 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.981us 0.08% 4.981us 1.660us 0.000us 0.00% 0.000us 0.000us 3
|
| 4006 |
+
cudaLaunchKernel 0.44% 28.031us 0.44% 28.031us 9.344us 0.000us 0.00% 0.000us 0.000us 3
|
| 4007 |
+
cudaDeviceSynchronize 73.06% 4.615ms 73.06% 4.615ms 4.615ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4008 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4009 |
+
Self CPU time total: 6.317ms
|
| 4010 |
+
Self CUDA time total: 4.779ms
|
| 4011 |
|
| 4012 |
|
| 4013 |
|
|
|
|
| 4017 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4018 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4019 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4020 |
+
hf_kernels_layer_norm 1.11% 111.882us 6.14% 619.354us 619.354us 0.000us 0.00% 12.808ms 12.808ms 1
|
| 4021 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 0.46% 46.119us 4.92% 496.462us 165.487us 9.625ms 100.00% 12.808ms 4.269ms 3
|
| 4022 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.626ms 100.01% 9.626ms 9.626ms 1
|
| 4023 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.625ms 100.00% 9.625ms 3.208ms 3
|
| 4024 |
+
Activity Buffer Request 1.38% 138.943us 1.38% 138.943us 138.943us 3.183ms 33.07% 3.183ms 3.183ms 1
|
| 4025 |
+
aten::view 0.11% 11.010us 0.11% 11.010us 1.835us 0.000us 0.00% 0.000us 0.000us 6
|
| 4026 |
+
aten::empty 0.31% 31.174us 0.31% 31.174us 3.464us 0.000us 0.00% 0.000us 0.000us 9
|
| 4027 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.190us 0.05% 5.190us 1.730us 0.000us 0.00% 0.000us 0.000us 3
|
| 4028 |
+
cudaLaunchKernel 2.73% 275.036us 2.73% 275.036us 91.679us 0.000us 0.00% 0.000us 0.000us 3
|
| 4029 |
+
cudaDeviceSynchronize 93.86% 9.465ms 93.86% 9.465ms 9.465ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
+
Self CPU time total: 10.085ms
|
| 4032 |
+
Self CUDA time total: 9.625ms
|
| 4033 |
|
| 4034 |
|
| 4035 |
impl wl p50(ms) ok
|
| 4036 |
+
hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
|
| 4037 |
+
hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
|
| 4038 |
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
|
| 4039 |
hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
|
| 4040 |
</pre></div>
|
|
|
|
| 4043 |
<div class="uv-logs-content" style="display: none;">
|
| 4044 |
Downloading hf-xet (3.2MiB)
|
| 4045 |
Downloading hf-xet
|
| 4046 |
+
Installed 52 packages in 191ms
|
| 4047 |
</div>
|
| 4048 |
</div>
|
| 4049 |
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4050 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.30it/s]
|
| 4051 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.60it/s]</div>
|
| 4052 |
<div class="cell-artifacts">
|
| 4053 |
<h4>Artifacts:</h4>
|
| 4054 |
<a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
|
layer_norm/impls/torch_layer_norm.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
@@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3871 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: nv | 0.
|
| 3875 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3877 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3887,7 +3895,7 @@ Cell: nv | 0.26s
|
|
| 3887 |
</div>
|
| 3888 |
</div>
|
| 3889 |
<div id="output-nv" class="cell-output">
|
| 3890 |
-
<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29
|
| 3891 |
+-----------------------------------------------------------------------------------------+
|
| 3892 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3893 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -3896,7 +3904,7 @@ Cell: nv | 0.26s
|
|
| 3896 |
| | | MIG M. |
|
| 3897 |
|=========================================+========================+======================|
|
| 3898 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3899 |
-
| N/A
|
| 3900 |
| | | N/A |
|
| 3901 |
+-----------------------------------------+------------------------+----------------------+
|
| 3902 |
|
|
@@ -3918,9 +3926,9 @@ Cell: nv | 0.26s
|
|
| 3918 |
<span class="collapse-indicators">
|
| 3919 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3920 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3921 |
-
<span id="uv-indicator-benchmark"
|
| 3922 |
</span> |
|
| 3923 |
-
Cell: benchmark |
|
| 3924 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3925 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3926 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3968,19 +3976,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
|
|
| 3968 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3969 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3970 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3971 |
-
torch_layer_norm 3.
|
| 3972 |
-
aten::layer_norm 0.43% 16.
|
| 3973 |
-
aten::native_layer_norm 2.
|
| 3974 |
-
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3975 |
-
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3976 |
-
Activity Buffer Request
|
| 3977 |
-
aten::empty 1.
|
| 3978 |
-
cudaLaunchKernel 1.
|
| 3979 |
-
aten::view 0.17% 6.
|
| 3980 |
-
cudaDeviceSynchronize
|
| 3981 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3982 |
-
Self CPU time total: 3.
|
| 3983 |
-
Self CUDA time total: 2.
|
| 3984 |
|
| 3985 |
|
| 3986 |
|
|
@@ -3990,19 +3998,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
|
|
| 3990 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3991 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3992 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3993 |
-
torch_layer_norm 1.
|
| 3994 |
-
aten::layer_norm 0.14% 9.
|
| 3995 |
-
aten::native_layer_norm 0.81% 51.
|
| 3996 |
-
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3997 |
-
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3998 |
-
Activity Buffer Request 22.
|
| 3999 |
-
aten::empty 0.
|
| 4000 |
-
cudaLaunchKernel 0.
|
| 4001 |
-
aten::view 0.
|
| 4002 |
-
cudaDeviceSynchronize 74.
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
-
Self CPU time total: 6.
|
| 4005 |
-
Self CUDA time total: 4.
|
| 4006 |
|
| 4007 |
|
| 4008 |
|
|
@@ -4012,19 +4020,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
|
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4014 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4015 |
-
torch_layer_norm 1.
|
| 4016 |
-
aten::layer_norm 0.15% 9.
|
| 4017 |
-
aten::native_layer_norm 0.
|
| 4018 |
-
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4019 |
-
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4020 |
-
Activity Buffer Request 23.
|
| 4021 |
-
aten::empty 0.46% 28.
|
| 4022 |
-
cudaLaunchKernel 0.
|
| 4023 |
-
aten::view 0.
|
| 4024 |
-
cudaDeviceSynchronize 73.
|
| 4025 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4026 |
-
Self CPU time total: 6.
|
| 4027 |
-
Self CUDA time total: 4.
|
| 4028 |
|
| 4029 |
|
| 4030 |
|
|
@@ -4034,19 +4042,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
|
|
| 4034 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4035 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
-
torch_layer_norm 0.
|
| 4038 |
-
aten::layer_norm 0.
|
| 4039 |
-
aten::native_layer_norm 0.
|
| 4040 |
-
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4041 |
-
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4042 |
-
Activity Buffer Request
|
| 4043 |
-
aten::empty 0.
|
| 4044 |
-
cudaLaunchKernel
|
| 4045 |
-
aten::view 0.
|
| 4046 |
-
cudaDeviceSynchronize
|
| 4047 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4048 |
-
Self CPU time total: 11.
|
| 4049 |
-
Self CUDA time total: 9.
|
| 4050 |
|
| 4051 |
|
| 4052 |
impl wl p50(ms) ok
|
|
@@ -4055,12 +4063,6 @@ torch_layer_norm LN_B16_S2048_D8192 1.68 True
|
|
| 4055 |
torch_layer_norm LN_B16_S4096_D4096 1.61 True
|
| 4056 |
torch_layer_norm LN_B16_S4096_D8192 3.33 True
|
| 4057 |
</pre></div>
|
| 4058 |
-
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4059 |
-
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4060 |
-
<div class="uv-logs-content" style="display: none;">
|
| 4061 |
-
Installed 37 packages in 222ms
|
| 4062 |
-
</div>
|
| 4063 |
-
</div>
|
| 4064 |
<div class="cell-artifacts">
|
| 4065 |
<h4>Artifacts:</h4>
|
| 4066 |
<a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
|
|
|
| 3879 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3880 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3881 |
</span> |
|
| 3882 |
+
Cell: nv | 0.23s
|
| 3883 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3884 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3885 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3895 |
</div>
|
| 3896 |
</div>
|
| 3897 |
<div id="output-nv" class="cell-output">
|
| 3898 |
+
<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 15:50:44 2025
|
| 3899 |
+-----------------------------------------------------------------------------------------+
|
| 3900 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3901 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3904 |
| | | MIG M. |
|
| 3905 |
|=========================================+========================+======================|
|
| 3906 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3907 |
+
| N/A 29C P0 138W / 350W | 0MiB / 46068MiB | 49% Default |
|
| 3908 |
| | | N/A |
|
| 3909 |
+-----------------------------------------+------------------------+----------------------+
|
| 3910 |
|
|
|
|
| 3926 |
<span class="collapse-indicators">
|
| 3927 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3928 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3929 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3930 |
</span> |
|
| 3931 |
+
Cell: benchmark | 3.85s
|
| 3932 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3933 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3934 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3976 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3977 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3978 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3979 |
+
torch_layer_norm 3.59% 140.394us 45.88% 1.793ms 1.793ms 0.000us 0.00% 3.034ms 3.034ms 1
|
| 3980 |
+
aten::layer_norm 0.43% 16.891us 42.29% 1.653ms 551.033us 0.000us 0.00% 3.034ms 1.011ms 3
|
| 3981 |
+
aten::native_layer_norm 2.49% 97.515us 41.85% 1.636ms 545.403us 2.324ms 100.00% 3.034ms 1.011ms 3
|
| 3982 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.325ms 100.07% 2.325ms 2.325ms 1
|
| 3983 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.324ms 100.00% 2.324ms 774.631us 3
|
| 3984 |
+
Activity Buffer Request 36.92% 1.443ms 36.92% 1.443ms 1.443ms 709.916us 30.55% 709.916us 709.916us 1
|
| 3985 |
+
aten::empty 1.11% 43.309us 1.11% 43.309us 4.812us 0.000us 0.00% 0.000us 0.000us 9
|
| 3986 |
+
cudaLaunchKernel 1.17% 45.620us 1.17% 45.620us 15.207us 0.000us 0.00% 0.000us 0.000us 3
|
| 3987 |
+
aten::view 0.17% 6.600us 0.17% 6.600us 1.100us 0.000us 0.00% 0.000us 0.000us 6
|
| 3988 |
+
cudaDeviceSynchronize 54.12% 2.116ms 54.12% 2.116ms 2.116ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3989 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3990 |
+
Self CPU time total: 3.909ms
|
| 3991 |
+
Self CUDA time total: 2.324ms
|
| 3992 |
|
| 3993 |
|
| 3994 |
|
|
|
|
| 3998 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3999 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4000 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4001 |
+
torch_layer_norm 1.51% 96.533us 25.68% 1.646ms 1.646ms 0.000us 0.00% 6.506ms 6.506ms 1
|
| 4002 |
+
aten::layer_norm 0.14% 9.019us 24.18% 1.550ms 516.535us 0.000us 0.00% 6.506ms 2.169ms 3
|
| 4003 |
+
aten::native_layer_norm 0.81% 51.783us 24.04% 1.541ms 513.529us 4.903ms 100.00% 6.506ms 2.169ms 3
|
| 4004 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.905ms 100.03% 4.905ms 4.905ms 1
|
| 4005 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.903ms 100.00% 4.903ms 1.634ms 3
|
| 4006 |
+
Activity Buffer Request 22.28% 1.428ms 22.28% 1.428ms 1.428ms 1.602ms 32.68% 1.602ms 1.602ms 1
|
| 4007 |
+
aten::empty 0.45% 29.001us 0.45% 29.001us 3.222us 0.000us 0.00% 0.000us 0.000us 9
|
| 4008 |
+
cudaLaunchKernel 0.43% 27.850us 0.43% 27.850us 9.283us 0.000us 0.00% 0.000us 0.000us 3
|
| 4009 |
+
aten::view 0.07% 4.220us 0.07% 4.220us 0.703us 0.000us 0.00% 0.000us 0.000us 6
|
| 4010 |
+
cudaDeviceSynchronize 74.32% 4.763ms 74.32% 4.763ms 4.763ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4011 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4012 |
+
Self CPU time total: 6.409ms
|
| 4013 |
+
Self CUDA time total: 4.903ms
|
| 4014 |
|
| 4015 |
|
| 4016 |
|
|
|
|
| 4020 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4021 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4022 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4023 |
+
torch_layer_norm 1.49% 93.320us 26.51% 1.656ms 1.656ms 0.000us 0.00% 6.235ms 6.235ms 1
|
| 4024 |
+
aten::layer_norm 0.15% 9.262us 25.02% 1.563ms 520.876us 0.000us 0.00% 6.235ms 2.078ms 3
|
| 4025 |
+
aten::native_layer_norm 0.82% 51.181us 24.87% 1.553ms 517.789us 4.722ms 100.00% 6.235ms 2.078ms 3
|
| 4026 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.723ms 100.03% 4.723ms 4.723ms 1
|
| 4027 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.722ms 100.00% 4.722ms 1.574ms 3
|
| 4028 |
+
Activity Buffer Request 23.09% 1.443ms 23.09% 1.443ms 1.443ms 1.513ms 32.04% 1.513ms 1.513ms 1
|
| 4029 |
+
aten::empty 0.46% 28.530us 0.46% 28.530us 3.170us 0.000us 0.00% 0.000us 0.000us 9
|
| 4030 |
+
cudaLaunchKernel 0.44% 27.431us 0.44% 27.431us 9.144us 0.000us 0.00% 0.000us 0.000us 3
|
| 4031 |
+
aten::view 0.06% 3.670us 0.06% 3.670us 0.612us 0.000us 0.00% 0.000us 0.000us 6
|
| 4032 |
+
cudaDeviceSynchronize 73.49% 4.591ms 73.49% 4.591ms 4.591ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4033 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4034 |
+
Self CPU time total: 6.247ms
|
| 4035 |
+
Self CUDA time total: 4.722ms
|
| 4036 |
|
| 4037 |
|
| 4038 |
|
|
|
|
| 4042 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4043 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4044 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4045 |
+
torch_layer_norm 0.75% 86.532us 16.17% 1.873ms 1.873ms 0.000us 0.00% 13.086ms 13.086ms 1
|
| 4046 |
+
aten::layer_norm 0.08% 9.721us 15.43% 1.787ms 595.631us 0.000us 0.00% 13.086ms 4.362ms 3
|
| 4047 |
+
aten::native_layer_norm 0.46% 53.132us 15.34% 1.777ms 592.390us 9.848ms 100.00% 13.086ms 4.362ms 3
|
| 4048 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.850ms 100.01% 9.850ms 9.850ms 1
|
| 4049 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.848ms 100.00% 9.848ms 3.283ms 3
|
| 4050 |
+
Activity Buffer Request 12.61% 1.460ms 12.61% 1.460ms 1.460ms 3.238ms 32.88% 3.238ms 3.238ms 1
|
| 4051 |
+
aten::empty 0.27% 30.840us 0.27% 30.840us 3.427us 0.000us 0.00% 0.000us 0.000us 9
|
| 4052 |
+
cudaLaunchKernel 1.98% 229.105us 1.98% 229.105us 76.368us 0.000us 0.00% 0.000us 0.000us 3
|
| 4053 |
+
aten::view 0.03% 3.969us 0.03% 3.969us 0.661us 0.000us 0.00% 0.000us 0.000us 6
|
| 4054 |
+
cudaDeviceSynchronize 83.83% 9.710ms 83.83% 9.710ms 9.710ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4055 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4056 |
+
Self CPU time total: 11.583ms
|
| 4057 |
+
Self CUDA time total: 9.848ms
|
| 4058 |
|
| 4059 |
|
| 4060 |
impl wl p50(ms) ok
|
|
|
|
| 4063 |
torch_layer_norm LN_B16_S4096_D4096 1.61 True
|
| 4064 |
torch_layer_norm LN_B16_S4096_D8192 3.33 True
|
| 4065 |
</pre></div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4066 |
<div class="cell-artifacts">
|
| 4067 |
<h4>Artifacts:</h4>
|
| 4068 |
<a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
|
layer_norm/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
layer_norm/results/combined_results.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
@@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3872 |
<rdf:RDF>
|
| 3873 |
<ns2:Work>
|
| 3874 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3875 |
-
<dc:date>2025-10-
|
| 3876 |
<dc:format>image/svg+xml</dc:format>
|
| 3877 |
<dc:creator>
|
| 3878 |
<ns2:Agent>
|
|
@@ -3956,70 +3964,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3956 |
<g id="matplotlib.axis_2">
|
| 3957 |
<g id="ytick_1">
|
| 3958 |
<g id="grid-y--2" class="grid grid-y">
|
| 3959 |
-
<path d="M 47.72
|
| 3960 |
</g>
|
| 3961 |
<g id="line2d_5">
|
| 3962 |
<defs>
|
| 3963 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3964 |
</defs>
|
| 3965 |
<g>
|
| 3966 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 3967 |
</g>
|
| 3968 |
</g>
|
| 3969 |
<g id="text_5">
|
| 3970 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 3971 |
</g>
|
| 3972 |
</g>
|
| 3973 |
<g id="ytick_2">
|
| 3974 |
<g id="grid-y--3" class="grid grid-y">
|
| 3975 |
-
<path d="M 47.72 331.
|
| 3976 |
</g>
|
| 3977 |
<g id="line2d_6">
|
| 3978 |
<g>
|
| 3979 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="331.
|
| 3980 |
</g>
|
| 3981 |
</g>
|
| 3982 |
<g id="text_6">
|
| 3983 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 3984 |
</g>
|
| 3985 |
</g>
|
| 3986 |
<g id="ytick_3">
|
| 3987 |
<g id="grid-y--4" class="grid grid-y">
|
| 3988 |
-
<path d="M 47.72 253.
|
| 3989 |
</g>
|
| 3990 |
<g id="line2d_7">
|
| 3991 |
<g>
|
| 3992 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="253.
|
| 3993 |
</g>
|
| 3994 |
</g>
|
| 3995 |
<g id="text_7">
|
| 3996 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 3997 |
</g>
|
| 3998 |
</g>
|
| 3999 |
<g id="ytick_4">
|
| 4000 |
<g id="grid-y--5" class="grid grid-y">
|
| 4001 |
-
<path d="M 47.72 175.
|
| 4002 |
</g>
|
| 4003 |
<g id="line2d_8">
|
| 4004 |
<g>
|
| 4005 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="175.
|
| 4006 |
</g>
|
| 4007 |
</g>
|
| 4008 |
<g id="text_8">
|
| 4009 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="ytick_5">
|
| 4013 |
<g id="grid-y--6" class="grid grid-y">
|
| 4014 |
-
<path d="M 47.72 97.
|
| 4015 |
</g>
|
| 4016 |
<g id="line2d_9">
|
| 4017 |
<g>
|
| 4018 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="97.
|
| 4019 |
</g>
|
| 4020 |
</g>
|
| 4021 |
<g id="text_9">
|
| 4022 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="label--y" class="ylabel">
|
|
@@ -4027,27 +4035,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4027 |
</g>
|
| 4028 |
</g>
|
| 4029 |
<g id="series--torch-layer-norm" class="series">
|
| 4030 |
-
<path d="M 83.741924 437.689571 L 323.888085
|
| 4031 |
<defs>
|
| 4032 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4033 |
</defs>
|
| 4034 |
<g clip-path="url(#p2214f54723)">
|
| 4035 |
<use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4036 |
-
<use ns4:href="#md7efaf3aec" x="323.888085" y="
|
| 4037 |
-
<use ns4:href="#md7efaf3aec" x="564.034245" y="
|
| 4038 |
<use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4039 |
</g>
|
| 4040 |
</g>
|
| 4041 |
<g id="series--hf-kernels-layer-norm" class="series">
|
| 4042 |
-
<path d="M 83.741924
|
| 4043 |
<defs>
|
| 4044 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4045 |
</defs>
|
| 4046 |
<g clip-path="url(#p2214f54723)">
|
| 4047 |
-
<use ns4:href="#m9b8c54d372" x="83.741924" y="
|
| 4048 |
-
<use ns4:href="#m9b8c54d372" x="323.888085" y="307.
|
| 4049 |
-
<use ns4:href="#m9b8c54d372" x="564.034245" y="307.
|
| 4050 |
-
<use ns4:href="#m9b8c54d372" x="804.180406" y="57.
|
| 4051 |
</g>
|
| 4052 |
</g>
|
| 4053 |
<g id="patch_3">
|
|
@@ -4105,7 +4113,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4105 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4106 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4107 |
</span> |
|
| 4108 |
-
Cell: combine | 4.
|
| 4109 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4110 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4111 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4192,8 +4200,8 @@ Summary: 2 found, 0 skipped, 0 missing
|
|
| 4192 |
COMBINED BENCHMARK SUMMARY
|
| 4193 |
|
| 4194 |
impl wl p50(ms) ok
|
| 4195 |
-
hf_kernels_layer_norm LN_B16_S2048_D4096 0.
|
| 4196 |
-
hf_kernels_layer_norm LN_B16_S2048_D8192 1.
|
| 4197 |
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
|
| 4198 |
hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
|
| 4199 |
torch_layer_norm LN_B16_S2048_D4096 0.82 True
|
|
@@ -4219,7 +4227,7 @@ Implementations included:
|
|
| 4219 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4220 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4221 |
<div class="uv-logs-content" style="display: none;">
|
| 4222 |
-
Installed 37 packages in
|
| 4223 |
</div>
|
| 4224 |
</div>
|
| 4225 |
<div class="cell-artifacts">
|
|
@@ -4232,7 +4240,7 @@ Installed 37 packages in 210ms
|
|
| 4232 |
<rdf:RDF>
|
| 4233 |
<ns2:Work>
|
| 4234 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4235 |
-
<dc:date>2025-10-
|
| 4236 |
<dc:format>image/svg+xml</dc:format>
|
| 4237 |
<dc:creator>
|
| 4238 |
<ns2:Agent>
|
|
@@ -4316,70 +4324,70 @@ Installed 37 packages in 210ms
|
|
| 4316 |
<g id="matplotlib.axis_2">
|
| 4317 |
<g id="ytick_1">
|
| 4318 |
<g id="grid-y--2" class="grid grid-y">
|
| 4319 |
-
<path d="M 47.72
|
| 4320 |
</g>
|
| 4321 |
<g id="line2d_5">
|
| 4322 |
<defs>
|
| 4323 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4324 |
</defs>
|
| 4325 |
<g>
|
| 4326 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4327 |
</g>
|
| 4328 |
</g>
|
| 4329 |
<g id="text_5">
|
| 4330 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4331 |
</g>
|
| 4332 |
</g>
|
| 4333 |
<g id="ytick_2">
|
| 4334 |
<g id="grid-y--3" class="grid grid-y">
|
| 4335 |
-
<path d="M 47.72 331.
|
| 4336 |
</g>
|
| 4337 |
<g id="line2d_6">
|
| 4338 |
<g>
|
| 4339 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="331.
|
| 4340 |
</g>
|
| 4341 |
</g>
|
| 4342 |
<g id="text_6">
|
| 4343 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4344 |
</g>
|
| 4345 |
</g>
|
| 4346 |
<g id="ytick_3">
|
| 4347 |
<g id="grid-y--4" class="grid grid-y">
|
| 4348 |
-
<path d="M 47.72 253.
|
| 4349 |
</g>
|
| 4350 |
<g id="line2d_7">
|
| 4351 |
<g>
|
| 4352 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="253.
|
| 4353 |
</g>
|
| 4354 |
</g>
|
| 4355 |
<g id="text_7">
|
| 4356 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4357 |
</g>
|
| 4358 |
</g>
|
| 4359 |
<g id="ytick_4">
|
| 4360 |
<g id="grid-y--5" class="grid grid-y">
|
| 4361 |
-
<path d="M 47.72 175.
|
| 4362 |
</g>
|
| 4363 |
<g id="line2d_8">
|
| 4364 |
<g>
|
| 4365 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="175.
|
| 4366 |
</g>
|
| 4367 |
</g>
|
| 4368 |
<g id="text_8">
|
| 4369 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.
|
| 4370 |
</g>
|
| 4371 |
</g>
|
| 4372 |
<g id="ytick_5">
|
| 4373 |
<g id="grid-y--6" class="grid grid-y">
|
| 4374 |
-
<path d="M 47.72 97.
|
| 4375 |
</g>
|
| 4376 |
<g id="line2d_9">
|
| 4377 |
<g>
|
| 4378 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="97.
|
| 4379 |
</g>
|
| 4380 |
</g>
|
| 4381 |
<g id="text_9">
|
| 4382 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.
|
| 4383 |
</g>
|
| 4384 |
</g>
|
| 4385 |
<g id="label--y" class="ylabel">
|
|
@@ -4387,27 +4395,27 @@ Installed 37 packages in 210ms
|
|
| 4387 |
</g>
|
| 4388 |
</g>
|
| 4389 |
<g id="series--torch-layer-norm" class="series">
|
| 4390 |
-
<path d="M 83.741924 437.689571 L 323.888085
|
| 4391 |
<defs>
|
| 4392 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4393 |
</defs>
|
| 4394 |
<g clip-path="url(#p2214f54723)">
|
| 4395 |
<use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4396 |
-
<use ns4:href="#md7efaf3aec" x="323.888085" y="
|
| 4397 |
-
<use ns4:href="#md7efaf3aec" x="564.034245" y="
|
| 4398 |
<use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4399 |
</g>
|
| 4400 |
</g>
|
| 4401 |
<g id="series--hf-kernels-layer-norm" class="series">
|
| 4402 |
-
<path d="M 83.741924
|
| 4403 |
<defs>
|
| 4404 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4405 |
</defs>
|
| 4406 |
<g clip-path="url(#p2214f54723)">
|
| 4407 |
-
<use ns4:href="#m9b8c54d372" x="83.741924" y="
|
| 4408 |
-
<use ns4:href="#m9b8c54d372" x="323.888085" y="307.
|
| 4409 |
-
<use ns4:href="#m9b8c54d372" x="564.034245" y="307.
|
| 4410 |
-
<use ns4:href="#m9b8c54d372" x="804.180406" y="57.
|
| 4411 |
</g>
|
| 4412 |
</g>
|
| 4413 |
<g id="patch_3">
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
|
|
|
| 3880 |
<rdf:RDF>
|
| 3881 |
<ns2:Work>
|
| 3882 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3883 |
+
<dc:date>2025-10-29T15:51:05.081730</dc:date>
|
| 3884 |
<dc:format>image/svg+xml</dc:format>
|
| 3885 |
<dc:creator>
|
| 3886 |
<ns2:Agent>
|
|
|
|
| 3964 |
<g id="matplotlib.axis_2">
|
| 3965 |
<g id="ytick_1">
|
| 3966 |
<g id="grid-y--2" class="grid grid-y">
|
| 3967 |
+
<path d="M 47.72 409.909979 L 840.20233 409.909979 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3968 |
</g>
|
| 3969 |
<g id="line2d_5">
|
| 3970 |
<defs>
|
| 3971 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3972 |
</defs>
|
| 3973 |
<g>
|
| 3974 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="409.909979" style="stroke: #000000; stroke-width: 0.8" />
|
| 3975 |
</g>
|
| 3976 |
</g>
|
| 3977 |
<g id="text_5">
|
| 3978 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="413.709198" transform="rotate(-0 40.72 413.709198)">1.0</text>
|
| 3979 |
</g>
|
| 3980 |
</g>
|
| 3981 |
<g id="ytick_2">
|
| 3982 |
<g id="grid-y--3" class="grid grid-y">
|
| 3983 |
+
<path d="M 47.72 331.917289 L 840.20233 331.917289 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3984 |
</g>
|
| 3985 |
<g id="line2d_6">
|
| 3986 |
<g>
|
| 3987 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="331.917289" style="stroke: #000000; stroke-width: 0.8" />
|
| 3988 |
</g>
|
| 3989 |
</g>
|
| 3990 |
<g id="text_6">
|
| 3991 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="335.716508" transform="rotate(-0 40.72 335.716508)">1.5</text>
|
| 3992 |
</g>
|
| 3993 |
</g>
|
| 3994 |
<g id="ytick_3">
|
| 3995 |
<g id="grid-y--4" class="grid grid-y">
|
| 3996 |
+
<path d="M 47.72 253.924599 L 840.20233 253.924599 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3997 |
</g>
|
| 3998 |
<g id="line2d_7">
|
| 3999 |
<g>
|
| 4000 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="253.924599" style="stroke: #000000; stroke-width: 0.8" />
|
| 4001 |
</g>
|
| 4002 |
</g>
|
| 4003 |
<g id="text_7">
|
| 4004 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.723817" transform="rotate(-0 40.72 257.723817)">2.0</text>
|
| 4005 |
</g>
|
| 4006 |
</g>
|
| 4007 |
<g id="ytick_4">
|
| 4008 |
<g id="grid-y--5" class="grid grid-y">
|
| 4009 |
+
<path d="M 47.72 175.931908 L 840.20233 175.931908 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4010 |
</g>
|
| 4011 |
<g id="line2d_8">
|
| 4012 |
<g>
|
| 4013 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="175.931908" style="stroke: #000000; stroke-width: 0.8" />
|
| 4014 |
</g>
|
| 4015 |
</g>
|
| 4016 |
<g id="text_8">
|
| 4017 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.731127" transform="rotate(-0 40.72 179.731127)">2.5</text>
|
| 4018 |
</g>
|
| 4019 |
</g>
|
| 4020 |
<g id="ytick_5">
|
| 4021 |
<g id="grid-y--6" class="grid grid-y">
|
| 4022 |
+
<path d="M 47.72 97.939218 L 840.20233 97.939218 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4023 |
</g>
|
| 4024 |
<g id="line2d_9">
|
| 4025 |
<g>
|
| 4026 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="97.939218" style="stroke: #000000; stroke-width: 0.8" />
|
| 4027 |
</g>
|
| 4028 |
</g>
|
| 4029 |
<g id="text_9">
|
| 4030 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.738437" transform="rotate(-0 40.72 101.738437)">3.0</text>
|
| 4031 |
</g>
|
| 4032 |
</g>
|
| 4033 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4035 |
</g>
|
| 4036 |
</g>
|
| 4037 |
<g id="series--torch-layer-norm" class="series">
|
| 4038 |
+
<path d="M 83.741924 437.689571 L 323.888085 303.515627 L 564.034245 314.85592 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4039 |
<defs>
|
| 4040 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4041 |
</defs>
|
| 4042 |
<g clip-path="url(#p2214f54723)">
|
| 4043 |
<use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4044 |
+
<use ns4:href="#md7efaf3aec" x="323.888085" y="303.515627" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4045 |
+
<use ns4:href="#md7efaf3aec" x="564.034245" y="314.85592" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4046 |
<use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4047 |
</g>
|
| 4048 |
</g>
|
| 4049 |
<g id="series--hf-kernels-layer-norm" class="series">
|
| 4050 |
+
<path d="M 83.741924 435.933176 L 323.888085 307.404498 L 564.034245 307.981644 L 804.180406 57.739446 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4051 |
<defs>
|
| 4052 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4053 |
</defs>
|
| 4054 |
<g clip-path="url(#p2214f54723)">
|
| 4055 |
+
<use ns4:href="#m9b8c54d372" x="83.741924" y="435.933176" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4056 |
+
<use ns4:href="#m9b8c54d372" x="323.888085" y="307.404498" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4057 |
+
<use ns4:href="#m9b8c54d372" x="564.034245" y="307.981644" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4058 |
+
<use ns4:href="#m9b8c54d372" x="804.180406" y="57.739446" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4059 |
</g>
|
| 4060 |
</g>
|
| 4061 |
<g id="patch_3">
|
|
|
|
| 4113 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4114 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4115 |
</span> |
|
| 4116 |
+
Cell: combine | 4.18s
|
| 4117 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4118 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4119 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4200 |
COMBINED BENCHMARK SUMMARY
|
| 4201 |
|
| 4202 |
impl wl p50(ms) ok
|
| 4203 |
+
hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
|
| 4204 |
+
hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
|
| 4205 |
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
|
| 4206 |
hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
|
| 4207 |
torch_layer_norm LN_B16_S2048_D4096 0.82 True
|
|
|
|
| 4227 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4228 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4229 |
<div class="uv-logs-content" style="display: none;">
|
| 4230 |
+
Installed 37 packages in 195ms
|
| 4231 |
</div>
|
| 4232 |
</div>
|
| 4233 |
<div class="cell-artifacts">
|
|
|
|
| 4240 |
<rdf:RDF>
|
| 4241 |
<ns2:Work>
|
| 4242 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4243 |
+
<dc:date>2025-10-29T15:51:05.081730</dc:date>
|
| 4244 |
<dc:format>image/svg+xml</dc:format>
|
| 4245 |
<dc:creator>
|
| 4246 |
<ns2:Agent>
|
|
|
|
| 4324 |
<g id="matplotlib.axis_2">
|
| 4325 |
<g id="ytick_1">
|
| 4326 |
<g id="grid-y--2" class="grid grid-y">
|
| 4327 |
+
<path d="M 47.72 409.909979 L 840.20233 409.909979 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4328 |
</g>
|
| 4329 |
<g id="line2d_5">
|
| 4330 |
<defs>
|
| 4331 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4332 |
</defs>
|
| 4333 |
<g>
|
| 4334 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="409.909979" style="stroke: #000000; stroke-width: 0.8" />
|
| 4335 |
</g>
|
| 4336 |
</g>
|
| 4337 |
<g id="text_5">
|
| 4338 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="413.709198" transform="rotate(-0 40.72 413.709198)">1.0</text>
|
| 4339 |
</g>
|
| 4340 |
</g>
|
| 4341 |
<g id="ytick_2">
|
| 4342 |
<g id="grid-y--3" class="grid grid-y">
|
| 4343 |
+
<path d="M 47.72 331.917289 L 840.20233 331.917289 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4344 |
</g>
|
| 4345 |
<g id="line2d_6">
|
| 4346 |
<g>
|
| 4347 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="331.917289" style="stroke: #000000; stroke-width: 0.8" />
|
| 4348 |
</g>
|
| 4349 |
</g>
|
| 4350 |
<g id="text_6">
|
| 4351 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="335.716508" transform="rotate(-0 40.72 335.716508)">1.5</text>
|
| 4352 |
</g>
|
| 4353 |
</g>
|
| 4354 |
<g id="ytick_3">
|
| 4355 |
<g id="grid-y--4" class="grid grid-y">
|
| 4356 |
+
<path d="M 47.72 253.924599 L 840.20233 253.924599 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4357 |
</g>
|
| 4358 |
<g id="line2d_7">
|
| 4359 |
<g>
|
| 4360 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="253.924599" style="stroke: #000000; stroke-width: 0.8" />
|
| 4361 |
</g>
|
| 4362 |
</g>
|
| 4363 |
<g id="text_7">
|
| 4364 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.723817" transform="rotate(-0 40.72 257.723817)">2.0</text>
|
| 4365 |
</g>
|
| 4366 |
</g>
|
| 4367 |
<g id="ytick_4">
|
| 4368 |
<g id="grid-y--5" class="grid grid-y">
|
| 4369 |
+
<path d="M 47.72 175.931908 L 840.20233 175.931908 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4370 |
</g>
|
| 4371 |
<g id="line2d_8">
|
| 4372 |
<g>
|
| 4373 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="175.931908" style="stroke: #000000; stroke-width: 0.8" />
|
| 4374 |
</g>
|
| 4375 |
</g>
|
| 4376 |
<g id="text_8">
|
| 4377 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.731127" transform="rotate(-0 40.72 179.731127)">2.5</text>
|
| 4378 |
</g>
|
| 4379 |
</g>
|
| 4380 |
<g id="ytick_5">
|
| 4381 |
<g id="grid-y--6" class="grid grid-y">
|
| 4382 |
+
<path d="M 47.72 97.939218 L 840.20233 97.939218 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4383 |
</g>
|
| 4384 |
<g id="line2d_9">
|
| 4385 |
<g>
|
| 4386 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="97.939218" style="stroke: #000000; stroke-width: 0.8" />
|
| 4387 |
</g>
|
| 4388 |
</g>
|
| 4389 |
<g id="text_9">
|
| 4390 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.738437" transform="rotate(-0 40.72 101.738437)">3.0</text>
|
| 4391 |
</g>
|
| 4392 |
</g>
|
| 4393 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4395 |
</g>
|
| 4396 |
</g>
|
| 4397 |
<g id="series--torch-layer-norm" class="series">
|
| 4398 |
+
<path d="M 83.741924 437.689571 L 323.888085 303.515627 L 564.034245 314.85592 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4399 |
<defs>
|
| 4400 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4401 |
</defs>
|
| 4402 |
<g clip-path="url(#p2214f54723)">
|
| 4403 |
<use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4404 |
+
<use ns4:href="#md7efaf3aec" x="323.888085" y="303.515627" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4405 |
+
<use ns4:href="#md7efaf3aec" x="564.034245" y="314.85592" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4406 |
<use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4407 |
</g>
|
| 4408 |
</g>
|
| 4409 |
<g id="series--hf-kernels-layer-norm" class="series">
|
| 4410 |
+
<path d="M 83.741924 435.933176 L 323.888085 307.404498 L 564.034245 307.981644 L 804.180406 57.739446 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4411 |
<defs>
|
| 4412 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4413 |
</defs>
|
| 4414 |
<g clip-path="url(#p2214f54723)">
|
| 4415 |
+
<use ns4:href="#m9b8c54d372" x="83.741924" y="435.933176" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4416 |
+
<use ns4:href="#m9b8c54d372" x="323.888085" y="307.404498" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4417 |
+
<use ns4:href="#m9b8c54d372" x="564.034245" y="307.981644" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4418 |
+
<use ns4:href="#m9b8c54d372" x="804.180406" y="57.739446" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4419 |
</g>
|
| 4420 |
</g>
|
| 4421 |
<g id="patch_3">
|
rotary/impls/artifacts/benchmark/rotary.jsonl
CHANGED
|
@@ -1,24 +1,24 @@
|
|
| 1 |
-
{"ts": "2025-10-
|
| 2 |
-
{"ts": "2025-10-
|
| 3 |
-
{"ts": "2025-10-
|
| 4 |
-
{"ts": "2025-10-
|
| 5 |
-
{"ts": "2025-10-
|
| 6 |
-
{"ts": "2025-10-
|
| 7 |
-
{"ts": "2025-10-
|
| 8 |
-
{"ts": "2025-10-
|
| 9 |
-
{"ts": "2025-10-
|
| 10 |
-
{"ts": "2025-10-
|
| 11 |
-
{"ts": "2025-10-
|
| 12 |
-
{"ts": "2025-10-
|
| 13 |
-
{"ts": "2025-10-
|
| 14 |
-
{"ts": "2025-10-
|
| 15 |
-
{"ts": "2025-10-
|
| 16 |
-
{"ts": "2025-10-
|
| 17 |
-
{"ts": "2025-10-
|
| 18 |
-
{"ts": "2025-10-
|
| 19 |
-
{"ts": "2025-10-
|
| 20 |
-
{"ts": "2025-10-
|
| 21 |
-
{"ts": "2025-10-
|
| 22 |
-
{"ts": "2025-10-
|
| 23 |
-
{"ts": "2025-10-
|
| 24 |
-
{"ts": "2025-10-
|
|
|
|
| 1 |
+
{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0735019999638098, "p50": 0.07410199998503231, "p90": 0.07441199994673298, "mean": 0.07416379996811884, "iqr": 0.00038999996831989847, "raw_times": [0.07478099996660603, 0.0735019999638098, 0.07441199994673298, 0.07402199997841308, 0.07410199998503231], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08146199996872383, "peak_bytes": 3178496, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590452924915553e-08, "mae_k": 1.5487040982975486e-08, "mse_q": 2.5241010080938753e-15, "mse_k": 2.364223539299626e-15, "ref": "rotary_torch"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0912220000373054, "p50": 0.09200200003078862, "p90": 0.09276200000840618, "mean": 0.09224400001812683, "iqr": 0.0012400000173329317, "raw_times": [0.09152199999107324, 0.09276200000840618, 0.0912220000373054, 0.09200200003078862, 0.09371200002306068], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09689300003401513, "peak_bytes": 6356992, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5508486939097565e-08, "mae_k": 1.567566698668088e-08, "mse_q": 2.3630110116356316e-15, "mse_k": 2.416562128626943e-15, "ref": "rotary_torch"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08820200002901402, "p50": 0.09085200002800775, "p90": 0.0915720000307374, "mean": 0.09087420002060753, "iqr": 0.002170000016121776, "raw_times": [0.08820200002901402, 0.09434300000066287, 0.08940200001461562, 0.0915720000307374, 0.09085200002800775], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0964319999638974, "peak_bytes": 12615680, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5856898016863852e-08, "mae_k": 1.572981211950264e-08, "mse_q": 2.4771055025978386e-15, "mse_k": 2.4544071371937915e-15, "ref": "rotary_torch"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09069200001476929, "p50": 0.09134200001881254, "p90": 0.09142199996858835, "mean": 0.09263220000548245, "iqr": 0.0006699999630654929, "raw_times": [0.09069200001476929, 0.09075200000552286, 0.09142199996858835, 0.09895300001971918, 0.09134200001881254], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09313199996086041, "peak_bytes": 25231360, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5617658277733426e-08, "mae_k": 1.5788685914230882e-08, "mse_q": 2.4549424620164562e-15, "mse_k": 2.492823469483563e-15, "ref": "rotary_torch"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0885120000475581, "p50": 0.08998200001997247, "p90": 0.09122199998046199, "mean": 0.09028400000943293, "iqr": 0.0016600000094513234, "raw_times": [0.09122199998046199, 0.0885120000475581, 0.09214200002816142, 0.08998200001997247, 0.08956199997101066], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1227330000119764, "peak_bytes": 12779520, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5962712041073246e-08, "mae_k": 1.5743363945830424e-08, "mse_q": 2.534145124782417e-15, "mse_k": 2.451281585618423e-15, "ref": "rotary_torch"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08860200000526675, "p50": 0.09058200004119499, "p90": 0.09118299999499868, "mean": 0.09031840000943703, "iqr": 0.0011699999618031143, "raw_times": [0.08860200000526675, 0.09001300003319557, 0.09058200004119499, 0.09121199997252916, 0.09118299999499868], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09078199997247793, "peak_bytes": 25427968, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.578730035589615e-08, "mae_k": 1.5859711766097462e-08, "mse_q": 2.440287521479536e-15, "mse_k": 2.477901290051784e-15, "ref": "rotary_torch"}, "err": null}
|
| 7 |
+
{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08772200004614206, "p50": 0.09064199997510514, "p90": 0.09105200001613412, "mean": 0.08990000001176668, "iqr": 0.002190000031987438, "raw_times": [0.08772200004614206, 0.09064199997510514, 0.09105200001613412, 0.0912220000373054, 0.08886199998414668], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09194199998319164, "peak_bytes": 50462720, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5775295736375483e-08, "mae_k": 1.5847881229547056e-08, "mse_q": 2.471039476146077e-15, "mse_k": 2.472378635235686e-15, "ref": "rotary_torch"}, "err": null}
|
| 8 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08855200002244601, "p50": 0.08938199999874996, "p90": 0.0907319999896572, "mean": 0.0897739999913938, "iqr": 0.0015100000041456951, "raw_times": [0.0892219999855115, 0.0909819999606043, 0.0907319999896572, 0.08855200002244601, 0.08938199999874996], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09457200002316313, "peak_bytes": 100925440, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5959869870130206e-08, "mae_k": 1.588083975434529e-08, "mse_q": 2.510663677418633e-15, "mse_k": 2.502786271009168e-15, "ref": "rotary_torch"}, "err": null}
|
| 9 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08880199999339311, "p50": 0.08953200000405559, "p90": 0.08999199997106189, "mean": 0.08967999999640597, "iqr": 0.0006899999789311551, "raw_times": [0.08880199999339311, 0.08930199999213073, 0.08999199997106189, 0.09077200002138852, 0.08953200000405559], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09282199999915974, "peak_bytes": 51118080, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5894533689220225e-08, "mae_k": 1.5873395042831362e-08, "mse_q": 2.5093181655819197e-15, "mse_k": 2.488611809911578e-15, "ref": "rotary_torch"}, "err": null}
|
| 10 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08904199995640738, "p50": 0.09102199999233562, "p90": 0.09121199997252916, "mean": 0.0907579999761765, "iqr": 0.0006099999723119254, "raw_times": [0.08904199995640738, 0.09191199995939314, 0.09121199997252916, 0.09060200000021723, 0.09102199999233562], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09379199997283649, "peak_bytes": 101711872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5936768349433805e-08, "mae_k": 1.5960043953100467e-08, "mse_q": 2.51039008577667e-15, "mse_k": 2.5111253103748867e-15, "ref": "rotary_torch"}, "err": null}
|
| 11 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09005199996181545, "p50": 0.09118200000557408, "p90": 0.0916120000056253, "mean": 0.09133820000215565, "iqr": 0.0005590000000665896, "raw_times": [0.09005199996181545, 0.09105300000555872, 0.09279200003220467, 0.0916120000056253, 0.09118200000557408], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09626199999956953, "peak_bytes": 201850880, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 1.9073486328125e-06, "absmax_k": 9.5367431640625e-07, "mae_q": 1.586510300910504e-08, "mae_k": 1.5813935050346117e-08, "mse_q": 2.499836478770355e-15, "mse_k": 2.4755639026338358e-15, "ref": "rotary_torch"}, "err": null}
|
| 12 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2600759999609181, "p50": 0.261636000004728, "p90": 0.2620960000285777, "mean": 0.26208780000160914, "iqr": 0.0012810000384888554, "raw_times": [0.2600759999609181, 0.261636000004728, 0.26581600002373307, 0.26081499999008884, 0.2620960000285777], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.263886000027469, "peak_bytes": 403701760, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.581049247079136e-08, "mae_k": 1.5861061797295406e-08, "mse_q": 2.4735094242202705e-15, "mse_k": 2.486832828964107e-15, "ref": "rotary_torch"}, "err": null}
|
| 13 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08898199996565381, "p50": 0.09088199999496283, "p90": 0.09099199996853713, "mean": 0.09348599999157159, "iqr": 0.001969999971151992, "raw_times": [0.08898199996565381, 0.09099199996853713, 0.10755200003131904, 0.09088199999496283, 0.08902199999738514], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09600300001011419, "peak_bytes": 137396224, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5824980437173508e-08, "mae_k": 1.5615324144846454e-08, "mse_q": 2.488090249374306e-15, "mse_k": 2.425079044911585e-15, "ref": "rotary_torch"}, "err": null}
|
| 14 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08880199999339311, "p50": 0.09035199997242671, "p90": 0.09093199997778356, "mean": 0.09011999998165265, "iqr": 0.0015400000279441883, "raw_times": [0.08939199994983937, 0.09093199997778356, 0.09035199997242671, 0.09112200001482051, 0.08880199999339311], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09145199999238685, "peak_bytes": 12648448, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5683587761827766e-08, "mae_k": 1.574532682013796e-08, "mse_q": 2.4310271220254415e-15, "mse_k": 2.4601385856313877e-15, "ref": "rotary_torch"}, "err": null}
|
| 15 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08985199997368909, "p50": 0.09101199998440279, "p90": 0.09125200000426048, "mean": 0.09087419999787016, "iqr": 0.0002900000026784255, "raw_times": [0.08985199997368909, 0.09096200000158206, 0.0912930000254164, 0.09125200000426048, 0.09101199998440279], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09303199999521894, "peak_bytes": 25198592, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5835009747888762e-08, "mae_k": 1.572560215379326e-08, "mse_q": 2.478222950813504e-15, "mse_k": 2.4541699679685603e-15, "ref": "rotary_torch"}, "err": null}
|
| 16 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08871200003568447, "p50": 0.0907719999645451, "p90": 0.09140200000956611, "mean": 0.09065600000894847, "iqr": 0.001259999976355175, "raw_times": [0.08871200003568447, 0.09225200000173572, 0.09014200003321093, 0.0907719999645451, 0.09140200000956611], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09131100000558945, "peak_bytes": 50397184, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5876850056883995e-08, "mae_k": 1.5927410501603845e-08, "mse_q": 2.504224532953606e-15, "mse_k": 2.503892919554756e-15, "ref": "rotary_torch"}, "err": null}
|
| 17 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08793200004220125, "p50": 0.0902419999988524, "p90": 0.09114200003068618, "mean": 0.09024000002000321, "iqr": 0.001160000010713702, "raw_times": [0.08793200004220125, 0.08998200001997247, 0.0902419999988524, 0.09114200003068618, 0.09190200000830373], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09403199999269418, "peak_bytes": 25362432, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5820052823301012e-08, "mae_k": 1.580205122309053e-08, "mse_q": 2.4876468276264184e-15, "mse_k": 2.4866062476507165e-15, "ref": "rotary_torch"}, "err": null}
|
| 18 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08730199999718025, "p50": 0.0906619999909708, "p90": 0.09115200003861901, "mean": 0.08998400001019036, "iqr": 0.0016399999935856613, "raw_times": [0.08730199999718025, 0.09115200003861901, 0.09129199997914839, 0.08951200004503335, 0.0906619999909708], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09093099998835896, "peak_bytes": 50593792, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5823172105911e-08, "mae_k": 1.582038855474366e-08, "mse_q": 2.464257071579175e-15, "mse_k": 2.4775099608301526e-15, "ref": "rotary_torch"}, "err": null}
|
| 19 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08923199999344433, "p50": 0.09018200000809884, "p90": 0.09221200002684782, "mean": 0.09105200000476543, "iqr": 0.0028300000280978566, "raw_times": [0.08923199999344433, 0.09221200002684782, 0.09018200000809884, 0.08938199999874996, 0.09425199999668621], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09410199999138058, "peak_bytes": 100794368, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5888783622131086e-08, "mae_k": 1.5861886026868888e-08, "mse_q": 2.4766798685418433e-15, "mse_k": 2.475923891636419e-15, "ref": "rotary_torch"}, "err": null}
|
| 20 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08850200003962527, "p50": 0.0899920000279053, "p90": 0.09176200001093093, "mean": 0.09526220001134789, "iqr": 0.002740000013545796, "raw_times": [0.08850200003962527, 0.0899920000279053, 0.11703299998089278, 0.08902199999738514, 0.09176200001093093], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09607300000880059, "peak_bytes": 201588736, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5826390864503992e-08, "mae_k": 1.5792682717119533e-08, "mse_q": 2.480465258783123e-15, "mse_k": 2.475580631534544e-15, "ref": "rotary_torch"}, "err": null}
|
| 21 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0902020000239645, "p50": 0.09163200002149097, "p90": 0.09188199999243807, "mean": 0.09142600000586754, "iqr": 0.0006299999881775875, "raw_times": [0.09163200002149097, 0.09216199998718366, 0.09188199999243807, 0.09125200000426048, 0.0902020000239645], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09537199997566859, "peak_bytes": 101449728, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.592899323554775e-08, "mae_k": 1.5925031959795888e-08, "mse_q": 2.50783882253954e-15, "mse_k": 2.5015648494992274e-15, "ref": "rotary_torch"}, "err": null}
|
| 22 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08815199998934986, "p50": 0.08920199996964584, "p90": 0.0900620000265917, "mean": 0.08925999999291889, "iqr": 0.001270000041131425, "raw_times": [0.08815199998934986, 0.09009199999354678, 0.0900620000265917, 0.08920199996964584, 0.08879199998546028], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09250199997268282, "peak_bytes": 202375168, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590209919299923e-08, "mae_k": 1.590130160877834e-08, "mse_q": 2.4971026799330918e-15, "mse_k": 2.506967649153289e-15, "ref": "rotary_torch"}, "err": null}
|
| 23 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.26207600001271203, "p50": 0.263255999982448, "p90": 0.2654460000144354, "mean": 0.26436599999897226, "iqr": 0.0022400000148081745, "raw_times": [0.26207600001271203, 0.263255999982448, 0.2678459999856386, 0.26320599999962724, 0.2654460000144354], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25824599998713893, "peak_bytes": 403177472, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5847520629108658e-08, "mae_k": 1.5862454461057496e-08, "mse_q": 2.4917348203881045e-15, "mse_k": 2.491306009958557e-15, "ref": "rotary_torch"}, "err": null}
|
| 24 |
+
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8428699999853961, "p50": 0.8440990000053716, "p90": 0.8457790000306886, "mean": 0.8458453999992344, "iqr": 0.0025290000280620006, "raw_times": [0.8428699999853961, 0.8532289999720888, 0.8432500000026266, 0.8440990000053716, 0.8457790000306886], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8568399999830945, "peak_bytes": 806354944, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.585225106737198e-08, "mae_k": 1.581303976649906e-08, "mse_q": 2.4866460581992374e-15, "mse_k": 2.4721545950211372e-15, "ref": "rotary_torch"}, "err": null}
|
rotary/impls/cells/benchmark.py
CHANGED
|
@@ -4,6 +4,7 @@
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
|
|
|
| 7 |
# ]
|
| 8 |
#
|
| 9 |
# [tool.uv.sources]
|
|
@@ -12,46 +13,36 @@
|
|
| 12 |
import torch
|
| 13 |
import sys
|
| 14 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
|
|
|
| 15 |
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
def apply_rotary_torch(x1, x2, cos, sin, conj=False):
|
| 18 |
-
"""Reference rotary implementation."""
|
| 19 |
-
if not conj:
|
| 20 |
-
out1 = x1 * cos - x2 * sin
|
| 21 |
-
out2 = x1 * sin + x2 * cos
|
| 22 |
-
else:
|
| 23 |
-
out1 = x1 * cos + x2 * sin
|
| 24 |
-
out2 = -x1 * sin + x2 * cos
|
| 25 |
-
return out1, out2
|
| 26 |
|
| 27 |
-
|
| 28 |
-
def torch_rotary(query, key, cos, sin, conj=False):
|
| 29 |
rotary_dim = cos.shape[-1]
|
| 30 |
|
| 31 |
-
# Clone
|
| 32 |
q_out = query.clone()
|
| 33 |
k_out = key.clone()
|
| 34 |
|
| 35 |
# Apply rotation to query
|
| 36 |
q1 = q_out[..., :rotary_dim]
|
| 37 |
q2 = q_out[..., rotary_dim : 2 * rotary_dim]
|
| 38 |
-
|
| 39 |
-
q_out[..., :rotary_dim] = q_out_1
|
| 40 |
-
q_out[..., rotary_dim : 2 * rotary_dim] = q_out_2
|
| 41 |
|
| 42 |
# Apply rotation to key
|
| 43 |
k1 = k_out[..., :rotary_dim]
|
| 44 |
k2 = k_out[..., rotary_dim : 2 * rotary_dim]
|
| 45 |
-
|
| 46 |
-
k_out[..., :rotary_dim] = k_out_1
|
| 47 |
-
k_out[..., rotary_dim : 2 * rotary_dim] = k_out_2
|
| 48 |
|
| 49 |
return q_out, k_out
|
| 50 |
|
| 51 |
|
| 52 |
run_benchmark(
|
| 53 |
kernel_type=KernelTypeEnum.ROTARY,
|
| 54 |
-
impl_name="
|
| 55 |
-
impl_tags={"family": "
|
| 56 |
-
impl_func=
|
|
|
|
| 57 |
)
|
|
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
+
# "kernels",
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
|
|
|
| 13 |
import torch
|
| 14 |
import sys
|
| 15 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
+
from kernels import get_kernel
|
| 17 |
|
| 18 |
+
# Load the rotary kernel
|
| 19 |
+
rotary = get_kernel("kernels-community/rotary")
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
+
def hf_kernels_rotary(query, key, cos, sin, conj=False):
|
|
|
|
| 23 |
rotary_dim = cos.shape[-1]
|
| 24 |
|
| 25 |
+
# Clone to avoid modifying inputs
|
| 26 |
q_out = query.clone()
|
| 27 |
k_out = key.clone()
|
| 28 |
|
| 29 |
# Apply rotation to query
|
| 30 |
q1 = q_out[..., :rotary_dim]
|
| 31 |
q2 = q_out[..., rotary_dim : 2 * rotary_dim]
|
| 32 |
+
rotary.apply_rotary(q1, q2, cos, sin, q1, q2, conj)
|
|
|
|
|
|
|
| 33 |
|
| 34 |
# Apply rotation to key
|
| 35 |
k1 = k_out[..., :rotary_dim]
|
| 36 |
k2 = k_out[..., rotary_dim : 2 * rotary_dim]
|
| 37 |
+
rotary.apply_rotary(k1, k2, cos, sin, k1, k2, conj)
|
|
|
|
|
|
|
| 38 |
|
| 39 |
return q_out, k_out
|
| 40 |
|
| 41 |
|
| 42 |
run_benchmark(
|
| 43 |
kernel_type=KernelTypeEnum.ROTARY,
|
| 44 |
+
impl_name="hf_kernels_rotary",
|
| 45 |
+
impl_tags={"family": "hf-kernels", "backend": "cuda"},
|
| 46 |
+
impl_func=hf_kernels_rotary,
|
| 47 |
+
dtype="float32",
|
| 48 |
)
|
rotary/impls/hf_kernels_rotary.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rotary/impls/torch_rotary.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rotary/index.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
rotary/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
rotary/results/combined_results.html
CHANGED
|
@@ -809,6 +809,14 @@
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
/* CSV table styling */
|
| 813 |
.artifact-csv {
|
| 814 |
margin-top: 1rem;
|
|
@@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3872 |
<rdf:RDF>
|
| 3873 |
<ns2:Work>
|
| 3874 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3875 |
-
<dc:date>2025-10-
|
| 3876 |
<dc:format>image/svg+xml</dc:format>
|
| 3877 |
<dc:creator>
|
| 3878 |
<ns2:Agent>
|
|
@@ -4216,108 +4224,179 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4216 |
<g id="matplotlib.axis_2">
|
| 4217 |
<g id="ytick_1">
|
| 4218 |
<g id="grid-y--2" class="grid grid-y">
|
| 4219 |
-
<path d="M 47.72
|
| 4220 |
</g>
|
| 4221 |
<g id="line2d_25">
|
| 4222 |
<defs>
|
| 4223 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4224 |
</defs>
|
| 4225 |
<g>
|
| 4226 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4227 |
</g>
|
| 4228 |
</g>
|
| 4229 |
<g id="text_25">
|
| 4230 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4231 |
</g>
|
| 4232 |
</g>
|
| 4233 |
<g id="ytick_2">
|
| 4234 |
<g id="grid-y--3" class="grid grid-y">
|
| 4235 |
-
<path d="M 47.72
|
| 4236 |
</g>
|
| 4237 |
<g id="line2d_26">
|
| 4238 |
<g>
|
| 4239 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4240 |
</g>
|
| 4241 |
</g>
|
| 4242 |
<g id="text_26">
|
| 4243 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4244 |
</g>
|
| 4245 |
</g>
|
| 4246 |
<g id="ytick_3">
|
| 4247 |
<g id="grid-y--4" class="grid grid-y">
|
| 4248 |
-
<path d="M 47.72
|
| 4249 |
</g>
|
| 4250 |
<g id="line2d_27">
|
| 4251 |
<g>
|
| 4252 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4253 |
</g>
|
| 4254 |
</g>
|
| 4255 |
<g id="text_27">
|
| 4256 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4257 |
</g>
|
| 4258 |
</g>
|
| 4259 |
<g id="ytick_4">
|
| 4260 |
<g id="grid-y--5" class="grid grid-y">
|
| 4261 |
-
<path d="M 47.72
|
| 4262 |
</g>
|
| 4263 |
<g id="line2d_28">
|
| 4264 |
<g>
|
| 4265 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4266 |
</g>
|
| 4267 |
</g>
|
| 4268 |
<g id="text_28">
|
| 4269 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4270 |
</g>
|
| 4271 |
</g>
|
| 4272 |
<g id="ytick_5">
|
| 4273 |
<g id="grid-y--6" class="grid grid-y">
|
| 4274 |
-
<path d="M 47.72
|
| 4275 |
</g>
|
| 4276 |
<g id="line2d_29">
|
| 4277 |
<g>
|
| 4278 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4279 |
</g>
|
| 4280 |
</g>
|
| 4281 |
<g id="text_29">
|
| 4282 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4283 |
</g>
|
| 4284 |
</g>
|
| 4285 |
<g id="label--y" class="ylabel">
|
| 4286 |
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="224.974753" transform="rotate(-90 18.737188 224.974753)">Latency P50 (ms)</text>
|
| 4287 |
</g>
|
| 4288 |
</g>
|
| 4289 |
-
<g id="series--
|
| 4290 |
-
<path d="M 82.966497 405.060892 L 113.615625
|
| 4291 |
<defs>
|
| 4292 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4293 |
</defs>
|
| 4294 |
<g clip-path="url(#p088c925177)">
|
| 4295 |
<use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4296 |
-
<use ns4:href="#md7efaf3aec" x="113.615625" y="
|
| 4297 |
-
<use ns4:href="#md7efaf3aec" x="144.264753" y="
|
| 4298 |
-
<use ns4:href="#md7efaf3aec" x="174.913881" y="
|
| 4299 |
-
<use ns4:href="#md7efaf3aec" x="205.563009" y="
|
| 4300 |
-
<use ns4:href="#md7efaf3aec" x="236.212137" y="
|
| 4301 |
-
<use ns4:href="#md7efaf3aec" x="266.861265" y="
|
| 4302 |
-
<use ns4:href="#md7efaf3aec" x="297.510393" y="
|
| 4303 |
-
<use ns4:href="#md7efaf3aec" x="328.159521" y="
|
| 4304 |
-
<use ns4:href="#md7efaf3aec" x="358.808648" y="
|
| 4305 |
-
<use ns4:href="#md7efaf3aec" x="389.457776" y="
|
| 4306 |
-
<use ns4:href="#md7efaf3aec" x="420.106904" y="
|
| 4307 |
-
<use ns4:href="#md7efaf3aec" x="450.756032" y="
|
| 4308 |
-
<use ns4:href="#md7efaf3aec" x="481.40516" y="
|
| 4309 |
-
<use ns4:href="#md7efaf3aec" x="512.054288" y="
|
| 4310 |
-
<use ns4:href="#md7efaf3aec" x="542.703416" y="
|
| 4311 |
-
<use ns4:href="#md7efaf3aec" x="573.352544" y="
|
| 4312 |
-
<use ns4:href="#md7efaf3aec" x="604.001672" y="
|
| 4313 |
-
<use ns4:href="#md7efaf3aec" x="634.6508" y="
|
| 4314 |
-
<use ns4:href="#md7efaf3aec" x="665.299928" y="
|
| 4315 |
-
<use ns4:href="#md7efaf3aec" x="695.949056" y="
|
| 4316 |
-
<use ns4:href="#md7efaf3aec" x="726.598184" y="
|
| 4317 |
-
<use ns4:href="#md7efaf3aec" x="757.247312" y="
|
| 4318 |
<use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4319 |
</g>
|
| 4320 |
</g>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4321 |
<g id="patch_3">
|
| 4322 |
<path d="M 47.72 423.069506 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4323 |
</g>
|
|
@@ -4330,21 +4409,30 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4330 |
<g id="patch_6">
|
| 4331 |
<path d="M 47.72 26.88 L 823.142937 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4332 |
</g>
|
| 4333 |
-
<g id="
|
| 4334 |
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.431468" y="20.88" transform="rotate(-0 435.431468 20.88)">Attention Implementation Latency</text>
|
| 4335 |
</g>
|
| 4336 |
<g id="legend" class="legend">
|
| 4337 |
<g id="patch_7">
|
| 4338 |
-
<path d="M 54.72
|
| 4339 |
</g>
|
| 4340 |
-
<g id="
|
| 4341 |
<path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4342 |
<g>
|
| 4343 |
<use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4344 |
</g>
|
| 4345 |
</g>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4346 |
<g id="legend-label--torch-eager" class="legend">
|
| 4347 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="
|
| 4348 |
</g>
|
| 4349 |
</g>
|
| 4350 |
</g>
|
|
@@ -4364,7 +4452,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4364 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4365 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4366 |
</span> |
|
| 4367 |
-
Cell: combine | 4.
|
| 4368 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4369 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4370 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4436,11 +4524,11 @@ Cell: combine | 4.35s
|
|
| 4436 |
<div class="cell-stdout"><pre class="stdout-text">======================================================================
|
| 4437 |
LOADING BENCHMARK DATA
|
| 4438 |
======================================================================
|
| 4439 |
-
✓ HF Kernels Rotary : /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/
|
| 4440 |
✓ PyTorch Rotary : /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/abf801d6445dfa81a8dd7b2e6257930c39c18160a9b97a739858c3b244e16cc5
|
| 4441 |
|
| 4442 |
✓ Found HF Kernels Rotary
|
| 4443 |
-
Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/
|
| 4444 |
✓ Found PyTorch Rotary
|
| 4445 |
Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/abf801d6445dfa81a8dd7b2e6257930c39c18160a9b97a739858c3b244e16cc5/rotary.jsonl
|
| 4446 |
|
|
@@ -4451,54 +4539,54 @@ Summary: 2 found, 0 skipped, 0 missing
|
|
| 4451 |
COMBINED BENCHMARK SUMMARY
|
| 4452 |
|
| 4453 |
impl wl p50(ms) ok
|
| 4454 |
-
hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09
|
| 4455 |
-
hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09
|
| 4456 |
-
hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.
|
| 4457 |
-
hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.
|
| 4458 |
-
hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.
|
| 4459 |
-
hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09
|
| 4460 |
-
hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09
|
| 4461 |
-
hf_kernels_rotary cuda_B1_S2048_H8_D64_R32 0.09
|
| 4462 |
-
hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09
|
| 4463 |
-
hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09
|
| 4464 |
-
hf_kernels_rotary cuda_B1_S512_H8_D128_R64 0.09
|
| 4465 |
-
hf_kernels_rotary cuda_B1_S512_H8_D64_R32 0.09
|
| 4466 |
-
hf_kernels_rotary cuda_B2_S128_H32_D128_R64 0.09
|
| 4467 |
-
hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09
|
| 4468 |
-
hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09
|
| 4469 |
-
hf_kernels_rotary cuda_B2_S128_H8_D64_R32 0.09
|
| 4470 |
-
hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.
|
| 4471 |
-
hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.
|
| 4472 |
-
hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09
|
| 4473 |
-
hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.09
|
| 4474 |
-
hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09
|
| 4475 |
-
hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.09
|
| 4476 |
-
hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09
|
| 4477 |
-
hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09
|
| 4478 |
-
torch_eager cuda_B1_S128_H32_D128_R64 0.
|
| 4479 |
torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
|
| 4480 |
-
torch_eager cuda_B1_S128_H8_D128_R64 0.
|
| 4481 |
-
torch_eager cuda_B1_S128_H8_D64_R32 0.
|
| 4482 |
-
torch_eager cuda_B1_S2048_H32_D128_R64 0.
|
| 4483 |
-
torch_eager cuda_B1_S2048_H32_D64_R32 0.
|
| 4484 |
-
torch_eager cuda_B1_S2048_H8_D128_R64 0.
|
| 4485 |
-
torch_eager cuda_B1_S2048_H8_D64_R32 0.
|
| 4486 |
-
torch_eager cuda_B1_S512_H32_D128_R64 0.
|
| 4487 |
-
torch_eager cuda_B1_S512_H32_D64_R32 0.
|
| 4488 |
-
torch_eager cuda_B1_S512_H8_D128_R64 0.
|
| 4489 |
torch_eager cuda_B1_S512_H8_D64_R32 0.22 True
|
| 4490 |
-
torch_eager cuda_B2_S128_H32_D128_R64 0.
|
| 4491 |
-
torch_eager cuda_B2_S128_H32_D64_R32 0.
|
| 4492 |
-
torch_eager cuda_B2_S128_H8_D128_R64 0.
|
| 4493 |
-
torch_eager cuda_B2_S128_H8_D64_R32 0.
|
| 4494 |
torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True
|
| 4495 |
torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
|
| 4496 |
torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True
|
| 4497 |
-
torch_eager cuda_B2_S2048_H8_D64_R32 0.
|
| 4498 |
-
torch_eager cuda_B2_S512_H32_D128_R64 0.
|
| 4499 |
-
torch_eager cuda_B2_S512_H32_D64_R32 0.
|
| 4500 |
torch_eager cuda_B2_S512_H8_D128_R64 0.21 True
|
| 4501 |
-
torch_eager cuda_B2_S512_H8_D64_R32 0.
|
| 4502 |
|
| 4503 |
GENERATING COMBINED VISUALIZATION
|
| 4504 |
|
|
@@ -4518,7 +4606,7 @@ Implementations included:
|
|
| 4518 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4519 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4520 |
<div class="uv-logs-content" style="display: none;">
|
| 4521 |
-
Installed 37 packages in
|
| 4522 |
</div>
|
| 4523 |
</div>
|
| 4524 |
<div class="cell-artifacts">
|
|
@@ -4531,7 +4619,7 @@ Installed 37 packages in 239ms
|
|
| 4531 |
<rdf:RDF>
|
| 4532 |
<ns2:Work>
|
| 4533 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4534 |
-
<dc:date>2025-10-
|
| 4535 |
<dc:format>image/svg+xml</dc:format>
|
| 4536 |
<dc:creator>
|
| 4537 |
<ns2:Agent>
|
|
@@ -4875,108 +4963,179 @@ Installed 37 packages in 239ms
|
|
| 4875 |
<g id="matplotlib.axis_2">
|
| 4876 |
<g id="ytick_1">
|
| 4877 |
<g id="grid-y--2" class="grid grid-y">
|
| 4878 |
-
<path d="M 47.72
|
| 4879 |
</g>
|
| 4880 |
<g id="line2d_25">
|
| 4881 |
<defs>
|
| 4882 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4883 |
</defs>
|
| 4884 |
<g>
|
| 4885 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4886 |
</g>
|
| 4887 |
</g>
|
| 4888 |
<g id="text_25">
|
| 4889 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4890 |
</g>
|
| 4891 |
</g>
|
| 4892 |
<g id="ytick_2">
|
| 4893 |
<g id="grid-y--3" class="grid grid-y">
|
| 4894 |
-
<path d="M 47.72
|
| 4895 |
</g>
|
| 4896 |
<g id="line2d_26">
|
| 4897 |
<g>
|
| 4898 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4899 |
</g>
|
| 4900 |
</g>
|
| 4901 |
<g id="text_26">
|
| 4902 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4903 |
</g>
|
| 4904 |
</g>
|
| 4905 |
<g id="ytick_3">
|
| 4906 |
<g id="grid-y--4" class="grid grid-y">
|
| 4907 |
-
<path d="M 47.72
|
| 4908 |
</g>
|
| 4909 |
<g id="line2d_27">
|
| 4910 |
<g>
|
| 4911 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4912 |
</g>
|
| 4913 |
</g>
|
| 4914 |
<g id="text_27">
|
| 4915 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4916 |
</g>
|
| 4917 |
</g>
|
| 4918 |
<g id="ytick_4">
|
| 4919 |
<g id="grid-y--5" class="grid grid-y">
|
| 4920 |
-
<path d="M 47.72
|
| 4921 |
</g>
|
| 4922 |
<g id="line2d_28">
|
| 4923 |
<g>
|
| 4924 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4925 |
</g>
|
| 4926 |
</g>
|
| 4927 |
<g id="text_28">
|
| 4928 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4929 |
</g>
|
| 4930 |
</g>
|
| 4931 |
<g id="ytick_5">
|
| 4932 |
<g id="grid-y--6" class="grid grid-y">
|
| 4933 |
-
<path d="M 47.72
|
| 4934 |
</g>
|
| 4935 |
<g id="line2d_29">
|
| 4936 |
<g>
|
| 4937 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4938 |
</g>
|
| 4939 |
</g>
|
| 4940 |
<g id="text_29">
|
| 4941 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4942 |
</g>
|
| 4943 |
</g>
|
| 4944 |
<g id="label--y" class="ylabel">
|
| 4945 |
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="224.974753" transform="rotate(-90 18.737188 224.974753)">Latency P50 (ms)</text>
|
| 4946 |
</g>
|
| 4947 |
</g>
|
| 4948 |
-
<g id="series--
|
| 4949 |
-
<path d="M 82.966497 405.060892 L 113.615625
|
| 4950 |
<defs>
|
| 4951 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4952 |
</defs>
|
| 4953 |
<g clip-path="url(#p088c925177)">
|
| 4954 |
<use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4955 |
-
<use ns4:href="#md7efaf3aec" x="113.615625" y="
|
| 4956 |
-
<use ns4:href="#md7efaf3aec" x="144.264753" y="
|
| 4957 |
-
<use ns4:href="#md7efaf3aec" x="174.913881" y="
|
| 4958 |
-
<use ns4:href="#md7efaf3aec" x="205.563009" y="
|
| 4959 |
-
<use ns4:href="#md7efaf3aec" x="236.212137" y="
|
| 4960 |
-
<use ns4:href="#md7efaf3aec" x="266.861265" y="
|
| 4961 |
-
<use ns4:href="#md7efaf3aec" x="297.510393" y="
|
| 4962 |
-
<use ns4:href="#md7efaf3aec" x="328.159521" y="
|
| 4963 |
-
<use ns4:href="#md7efaf3aec" x="358.808648" y="
|
| 4964 |
-
<use ns4:href="#md7efaf3aec" x="389.457776" y="
|
| 4965 |
-
<use ns4:href="#md7efaf3aec" x="420.106904" y="
|
| 4966 |
-
<use ns4:href="#md7efaf3aec" x="450.756032" y="
|
| 4967 |
-
<use ns4:href="#md7efaf3aec" x="481.40516" y="
|
| 4968 |
-
<use ns4:href="#md7efaf3aec" x="512.054288" y="
|
| 4969 |
-
<use ns4:href="#md7efaf3aec" x="542.703416" y="
|
| 4970 |
-
<use ns4:href="#md7efaf3aec" x="573.352544" y="
|
| 4971 |
-
<use ns4:href="#md7efaf3aec" x="604.001672" y="
|
| 4972 |
-
<use ns4:href="#md7efaf3aec" x="634.6508" y="
|
| 4973 |
-
<use ns4:href="#md7efaf3aec" x="665.299928" y="
|
| 4974 |
-
<use ns4:href="#md7efaf3aec" x="695.949056" y="
|
| 4975 |
-
<use ns4:href="#md7efaf3aec" x="726.598184" y="
|
| 4976 |
-
<use ns4:href="#md7efaf3aec" x="757.247312" y="
|
| 4977 |
<use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4978 |
</g>
|
| 4979 |
</g>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4980 |
<g id="patch_3">
|
| 4981 |
<path d="M 47.72 423.069506 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4982 |
</g>
|
|
@@ -4989,21 +5148,30 @@ Installed 37 packages in 239ms
|
|
| 4989 |
<g id="patch_6">
|
| 4990 |
<path d="M 47.72 26.88 L 823.142937 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4991 |
</g>
|
| 4992 |
-
<g id="
|
| 4993 |
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.431468" y="20.88" transform="rotate(-0 435.431468 20.88)">Attention Implementation Latency</text>
|
| 4994 |
</g>
|
| 4995 |
<g id="legend" class="legend">
|
| 4996 |
<g id="patch_7">
|
| 4997 |
-
<path d="M 54.72
|
| 4998 |
</g>
|
| 4999 |
-
<g id="
|
| 5000 |
<path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 5001 |
<g>
|
| 5002 |
<use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5003 |
</g>
|
| 5004 |
</g>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5005 |
<g id="legend-label--torch-eager" class="legend">
|
| 5006 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="
|
| 5007 |
</g>
|
| 5008 |
</g>
|
| 5009 |
</g>
|
|
|
|
| 809 |
.artifact-preview svg {
|
| 810 |
background: transparent;
|
| 811 |
}
|
| 812 |
+
/* Invert SVG images in dark mode */
|
| 813 |
+
:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
|
| 814 |
+
filter: invert(0.9) hue-rotate(180deg);
|
| 815 |
+
}
|
| 816 |
+
/* Keep SVG images readable in monocolor mode */
|
| 817 |
+
:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
|
| 818 |
+
filter: none;
|
| 819 |
+
}
|
| 820 |
/* CSV table styling */
|
| 821 |
.artifact-csv {
|
| 822 |
margin-top: 1rem;
|
|
|
|
| 3880 |
<rdf:RDF>
|
| 3881 |
<ns2:Work>
|
| 3882 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3883 |
+
<dc:date>2025-10-29T15:51:00.751980</dc:date>
|
| 3884 |
<dc:format>image/svg+xml</dc:format>
|
| 3885 |
<dc:creator>
|
| 3886 |
<ns2:Agent>
|
|
|
|
| 4224 |
<g id="matplotlib.axis_2">
|
| 4225 |
<g id="ytick_1">
|
| 4226 |
<g id="grid-y--2" class="grid grid-y">
|
| 4227 |
+
<path d="M 47.72 392.946895 L 823.142937 392.946895 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4228 |
</g>
|
| 4229 |
<g id="line2d_25">
|
| 4230 |
<defs>
|
| 4231 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4232 |
</defs>
|
| 4233 |
<g>
|
| 4234 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="392.946895" style="stroke: #000000; stroke-width: 0.8" />
|
| 4235 |
</g>
|
| 4236 |
</g>
|
| 4237 |
<g id="text_25">
|
| 4238 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="396.746114" transform="rotate(-0 40.72 396.746114)">0.1</text>
|
| 4239 |
</g>
|
| 4240 |
</g>
|
| 4241 |
<g id="ytick_2">
|
| 4242 |
<g id="grid-y--3" class="grid grid-y">
|
| 4243 |
+
<path d="M 47.72 346.171092 L 823.142937 346.171092 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4244 |
</g>
|
| 4245 |
<g id="line2d_26">
|
| 4246 |
<g>
|
| 4247 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="346.171092" style="stroke: #000000; stroke-width: 0.8" />
|
| 4248 |
</g>
|
| 4249 |
</g>
|
| 4250 |
<g id="text_26">
|
| 4251 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="349.970311" transform="rotate(-0 40.72 349.970311)">0.2</text>
|
| 4252 |
</g>
|
| 4253 |
</g>
|
| 4254 |
<g id="ytick_3">
|
| 4255 |
<g id="grid-y--4" class="grid grid-y">
|
| 4256 |
+
<path d="M 47.72 299.395289 L 823.142937 299.395289 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4257 |
</g>
|
| 4258 |
<g id="line2d_27">
|
| 4259 |
<g>
|
| 4260 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="299.395289" style="stroke: #000000; stroke-width: 0.8" />
|
| 4261 |
</g>
|
| 4262 |
</g>
|
| 4263 |
<g id="text_27">
|
| 4264 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="303.194508" transform="rotate(-0 40.72 303.194508)">0.3</text>
|
| 4265 |
</g>
|
| 4266 |
</g>
|
| 4267 |
<g id="ytick_4">
|
| 4268 |
<g id="grid-y--5" class="grid grid-y">
|
| 4269 |
+
<path d="M 47.72 252.619486 L 823.142937 252.619486 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4270 |
</g>
|
| 4271 |
<g id="line2d_28">
|
| 4272 |
<g>
|
| 4273 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="252.619486" style="stroke: #000000; stroke-width: 0.8" />
|
| 4274 |
</g>
|
| 4275 |
</g>
|
| 4276 |
<g id="text_28">
|
| 4277 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.418705" transform="rotate(-0 40.72 256.418705)">0.4</text>
|
| 4278 |
</g>
|
| 4279 |
</g>
|
| 4280 |
<g id="ytick_5">
|
| 4281 |
<g id="grid-y--6" class="grid grid-y">
|
| 4282 |
+
<path d="M 47.72 205.843684 L 823.142937 205.843684 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4283 |
</g>
|
| 4284 |
<g id="line2d_29">
|
| 4285 |
<g>
|
| 4286 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="205.843684" style="stroke: #000000; stroke-width: 0.8" />
|
| 4287 |
</g>
|
| 4288 |
</g>
|
| 4289 |
<g id="text_29">
|
| 4290 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="209.642902" transform="rotate(-0 40.72 209.642902)">0.5</text>
|
| 4291 |
+
</g>
|
| 4292 |
+
</g>
|
| 4293 |
+
<g id="ytick_6">
|
| 4294 |
+
<g id="grid-y--7" class="grid grid-y">
|
| 4295 |
+
<path d="M 47.72 159.067881 L 823.142937 159.067881 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4296 |
+
</g>
|
| 4297 |
+
<g id="line2d_30">
|
| 4298 |
+
<g>
|
| 4299 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="159.067881" style="stroke: #000000; stroke-width: 0.8" />
|
| 4300 |
+
</g>
|
| 4301 |
+
</g>
|
| 4302 |
+
<g id="text_30">
|
| 4303 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="162.8671" transform="rotate(-0 40.72 162.8671)">0.6</text>
|
| 4304 |
+
</g>
|
| 4305 |
+
</g>
|
| 4306 |
+
<g id="ytick_7">
|
| 4307 |
+
<g id="grid-y--8" class="grid grid-y">
|
| 4308 |
+
<path d="M 47.72 112.292078 L 823.142937 112.292078 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4309 |
+
</g>
|
| 4310 |
+
<g id="line2d_31">
|
| 4311 |
+
<g>
|
| 4312 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="112.292078" style="stroke: #000000; stroke-width: 0.8" />
|
| 4313 |
+
</g>
|
| 4314 |
+
</g>
|
| 4315 |
+
<g id="text_31">
|
| 4316 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="116.091297" transform="rotate(-0 40.72 116.091297)">0.7</text>
|
| 4317 |
+
</g>
|
| 4318 |
+
</g>
|
| 4319 |
+
<g id="ytick_8">
|
| 4320 |
+
<g id="grid-y--9" class="grid grid-y">
|
| 4321 |
+
<path d="M 47.72 65.516275 L 823.142937 65.516275 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4322 |
+
</g>
|
| 4323 |
+
<g id="line2d_32">
|
| 4324 |
+
<g>
|
| 4325 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="65.516275" style="stroke: #000000; stroke-width: 0.8" />
|
| 4326 |
+
</g>
|
| 4327 |
+
</g>
|
| 4328 |
+
<g id="text_32">
|
| 4329 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="69.315494" transform="rotate(-0 40.72 69.315494)">0.8</text>
|
| 4330 |
</g>
|
| 4331 |
</g>
|
| 4332 |
<g id="label--y" class="ylabel">
|
| 4333 |
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="224.974753" transform="rotate(-90 18.737188 224.974753)">Latency P50 (ms)</text>
|
| 4334 |
</g>
|
| 4335 |
</g>
|
| 4336 |
+
<g id="series--hf-kernels-rotary" class="series">
|
| 4337 |
+
<path d="M 82.966497 405.060892 L 113.615625 396.688024 L 144.264753 397.225945 L 174.913881 396.996744 L 205.563009 397.632895 L 236.212137 397.35224 L 266.861265 397.324175 L 297.510393 397.91355 L 328.159521 397.843386 L 358.808648 397.146426 L 389.457776 397.071585 L 420.106904 317.340358 L 450.756032 397.211913 L 481.40516 397.459824 L 512.054288 397.151104 L 542.703416 397.263366 L 573.352544 397.511278 L 604.001672 397.314819 L 634.6508 397.539343 L 665.299928 397.628217 L 695.949056 396.861094 L 726.598184 397.997746 L 757.247312 316.58259 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4338 |
<defs>
|
| 4339 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4340 |
</defs>
|
| 4341 |
<g clip-path="url(#p088c925177)">
|
| 4342 |
<use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4343 |
+
<use ns4:href="#md7efaf3aec" x="113.615625" y="396.688024" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4344 |
+
<use ns4:href="#md7efaf3aec" x="144.264753" y="397.225945" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4345 |
+
<use ns4:href="#md7efaf3aec" x="174.913881" y="396.996744" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4346 |
+
<use ns4:href="#md7efaf3aec" x="205.563009" y="397.632895" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4347 |
+
<use ns4:href="#md7efaf3aec" x="236.212137" y="397.35224" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4348 |
+
<use ns4:href="#md7efaf3aec" x="266.861265" y="397.324175" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4349 |
+
<use ns4:href="#md7efaf3aec" x="297.510393" y="397.91355" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4350 |
+
<use ns4:href="#md7efaf3aec" x="328.159521" y="397.843386" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4351 |
+
<use ns4:href="#md7efaf3aec" x="358.808648" y="397.146426" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4352 |
+
<use ns4:href="#md7efaf3aec" x="389.457776" y="397.071585" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4353 |
+
<use ns4:href="#md7efaf3aec" x="420.106904" y="317.340358" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4354 |
+
<use ns4:href="#md7efaf3aec" x="450.756032" y="397.211913" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4355 |
+
<use ns4:href="#md7efaf3aec" x="481.40516" y="397.459824" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4356 |
+
<use ns4:href="#md7efaf3aec" x="512.054288" y="397.151104" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4357 |
+
<use ns4:href="#md7efaf3aec" x="542.703416" y="397.263366" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4358 |
+
<use ns4:href="#md7efaf3aec" x="573.352544" y="397.511278" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4359 |
+
<use ns4:href="#md7efaf3aec" x="604.001672" y="397.314819" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4360 |
+
<use ns4:href="#md7efaf3aec" x="634.6508" y="397.539343" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4361 |
+
<use ns4:href="#md7efaf3aec" x="665.299928" y="397.628217" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4362 |
+
<use ns4:href="#md7efaf3aec" x="695.949056" y="396.861094" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4363 |
+
<use ns4:href="#md7efaf3aec" x="726.598184" y="397.997746" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4364 |
+
<use ns4:href="#md7efaf3aec" x="757.247312" y="316.58259" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4365 |
<use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4366 |
</g>
|
| 4367 |
</g>
|
| 4368 |
+
<g id="series--torch-eager" class="series">
|
| 4369 |
+
<path d="M 82.966497 359.229025 L 113.615625 336.991341 L 144.264753 338.68977 L 174.913881 340.649676 L 205.563009 337.183122 L 236.212137 342.30975 L 266.861265 341.346168 L 297.510393 339.896118 L 328.159521 341.271327 L 358.808648 340.438717 L 389.457776 342.132002 L 420.106904 334.919173 L 450.756032 340.405974 L 481.40516 340.528059 L 512.054288 341.112289 L 542.703416 340.373231 L 573.352544 340.218871 L 604.001672 341.364878 L 634.6508 340.069189 L 665.299928 340.574367 L 695.949056 340.438717 L 726.598184 339.068186 L 757.247312 332.696854 L 787.896439 141.132167 " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4370 |
+
<defs>
|
| 4371 |
+
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4372 |
+
</defs>
|
| 4373 |
+
<g clip-path="url(#p088c925177)">
|
| 4374 |
+
<use ns4:href="#m9b8c54d372" x="82.966497" y="359.229025" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4375 |
+
<use ns4:href="#m9b8c54d372" x="113.615625" y="336.991341" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4376 |
+
<use ns4:href="#m9b8c54d372" x="144.264753" y="338.68977" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4377 |
+
<use ns4:href="#m9b8c54d372" x="174.913881" y="340.649676" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4378 |
+
<use ns4:href="#m9b8c54d372" x="205.563009" y="337.183122" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4379 |
+
<use ns4:href="#m9b8c54d372" x="236.212137" y="342.30975" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4380 |
+
<use ns4:href="#m9b8c54d372" x="266.861265" y="341.346168" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4381 |
+
<use ns4:href="#m9b8c54d372" x="297.510393" y="339.896118" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4382 |
+
<use ns4:href="#m9b8c54d372" x="328.159521" y="341.271327" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4383 |
+
<use ns4:href="#m9b8c54d372" x="358.808648" y="340.438717" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4384 |
+
<use ns4:href="#m9b8c54d372" x="389.457776" y="342.132002" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4385 |
+
<use ns4:href="#m9b8c54d372" x="420.106904" y="334.919173" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4386 |
+
<use ns4:href="#m9b8c54d372" x="450.756032" y="340.405974" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4387 |
+
<use ns4:href="#m9b8c54d372" x="481.40516" y="340.528059" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4388 |
+
<use ns4:href="#m9b8c54d372" x="512.054288" y="341.112289" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4389 |
+
<use ns4:href="#m9b8c54d372" x="542.703416" y="340.373231" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4390 |
+
<use ns4:href="#m9b8c54d372" x="573.352544" y="340.218871" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4391 |
+
<use ns4:href="#m9b8c54d372" x="604.001672" y="341.364878" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4392 |
+
<use ns4:href="#m9b8c54d372" x="634.6508" y="340.069189" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4393 |
+
<use ns4:href="#m9b8c54d372" x="665.299928" y="340.574367" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4394 |
+
<use ns4:href="#m9b8c54d372" x="695.949056" y="340.438717" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4395 |
+
<use ns4:href="#m9b8c54d372" x="726.598184" y="339.068186" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4396 |
+
<use ns4:href="#m9b8c54d372" x="757.247312" y="332.696854" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4397 |
+
<use ns4:href="#m9b8c54d372" x="787.896439" y="141.132167" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4398 |
+
</g>
|
| 4399 |
+
</g>
|
| 4400 |
<g id="patch_3">
|
| 4401 |
<path d="M 47.72 423.069506 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4402 |
</g>
|
|
|
|
| 4409 |
<g id="patch_6">
|
| 4410 |
<path d="M 47.72 26.88 L 823.142937 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4411 |
</g>
|
| 4412 |
+
<g id="text_33">
|
| 4413 |
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.431468" y="20.88" transform="rotate(-0 435.431468 20.88)">Attention Implementation Latency</text>
|
| 4414 |
</g>
|
| 4415 |
<g id="legend" class="legend">
|
| 4416 |
<g id="patch_7">
|
| 4417 |
+
<path d="M 54.72 64.7925 L 172.655938 64.7925 Q 174.655938 64.7925 174.655938 62.7925 L 174.655938 33.88 Q 174.655938 31.88 172.655938 31.88 L 54.72 31.88 Q 52.72 31.88 52.72 33.88 L 52.72 62.7925 Q 52.72 64.7925 54.72 64.7925 L 54.72 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4418 |
</g>
|
| 4419 |
+
<g id="line2d_33">
|
| 4420 |
<path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4421 |
<g>
|
| 4422 |
<use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4423 |
</g>
|
| 4424 |
</g>
|
| 4425 |
+
<g id="legend-label--hf-kernels-rotary" class="legend">
|
| 4426 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">hf_kernels_rotary</text>
|
| 4427 |
+
</g>
|
| 4428 |
+
<g id="line2d_34">
|
| 4429 |
+
<path d="M 56.72 54.934687 L 66.72 54.934687 L 76.72 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4430 |
+
<g>
|
| 4431 |
+
<use ns4:href="#m9b8c54d372" x="66.72" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4432 |
+
</g>
|
| 4433 |
+
</g>
|
| 4434 |
<g id="legend-label--torch-eager" class="legend">
|
| 4435 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="58.434687" transform="rotate(-0 84.72 58.434687)">torch_eager</text>
|
| 4436 |
</g>
|
| 4437 |
</g>
|
| 4438 |
</g>
|
|
|
|
| 4452 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4453 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4454 |
</span> |
|
| 4455 |
+
Cell: combine | 4.43s
|
| 4456 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4457 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4458 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4524 |
<div class="cell-stdout"><pre class="stdout-text">======================================================================
|
| 4525 |
LOADING BENCHMARK DATA
|
| 4526 |
======================================================================
|
| 4527 |
+
✓ HF Kernels Rotary : /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/3884170bda871392d403d55c822a8b7de8970f81c4733ae7630938c3bf0db88a
|
| 4528 |
✓ PyTorch Rotary : /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/abf801d6445dfa81a8dd7b2e6257930c39c18160a9b97a739858c3b244e16cc5
|
| 4529 |
|
| 4530 |
✓ Found HF Kernels Rotary
|
| 4531 |
+
Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/3884170bda871392d403d55c822a8b7de8970f81c4733ae7630938c3bf0db88a/rotary.jsonl
|
| 4532 |
✓ Found PyTorch Rotary
|
| 4533 |
Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/abf801d6445dfa81a8dd7b2e6257930c39c18160a9b97a739858c3b244e16cc5/rotary.jsonl
|
| 4534 |
|
|
|
|
| 4539 |
COMBINED BENCHMARK SUMMARY
|
| 4540 |
|
| 4541 |
impl wl p50(ms) ok
|
| 4542 |
+
hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 True
|
| 4543 |
+
hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 True
|
| 4544 |
+
hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 True
|
| 4545 |
+
hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.07 True
|
| 4546 |
+
hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.26 True
|
| 4547 |
+
hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 True
|
| 4548 |
+
hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 True
|
| 4549 |
+
hf_kernels_rotary cuda_B1_S2048_H8_D64_R32 0.09 True
|
| 4550 |
+
hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09 True
|
| 4551 |
+
hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09 True
|
| 4552 |
+
hf_kernels_rotary cuda_B1_S512_H8_D128_R64 0.09 True
|
| 4553 |
+
hf_kernels_rotary cuda_B1_S512_H8_D64_R32 0.09 True
|
| 4554 |
+
hf_kernels_rotary cuda_B2_S128_H32_D128_R64 0.09 True
|
| 4555 |
+
hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09 True
|
| 4556 |
+
hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09 True
|
| 4557 |
+
hf_kernels_rotary cuda_B2_S128_H8_D64_R32 0.09 True
|
| 4558 |
+
hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.84 True
|
| 4559 |
+
hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.26 True
|
| 4560 |
+
hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09 True
|
| 4561 |
+
hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.09 True
|
| 4562 |
+
hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09 True
|
| 4563 |
+
hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.09 True
|
| 4564 |
+
hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 True
|
| 4565 |
+
hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True
|
| 4566 |
+
torch_eager cuda_B1_S128_H32_D128_R64 0.21 True
|
| 4567 |
torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
|
| 4568 |
+
torch_eager cuda_B1_S128_H8_D128_R64 0.22 True
|
| 4569 |
+
torch_eager cuda_B1_S128_H8_D64_R32 0.17 True
|
| 4570 |
+
torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True
|
| 4571 |
+
torch_eager cuda_B1_S2048_H32_D64_R32 0.21 True
|
| 4572 |
+
torch_eager cuda_B1_S2048_H8_D128_R64 0.21 True
|
| 4573 |
+
torch_eager cuda_B1_S2048_H8_D64_R32 0.21 True
|
| 4574 |
+
torch_eager cuda_B1_S512_H32_D128_R64 0.21 True
|
| 4575 |
+
torch_eager cuda_B1_S512_H32_D64_R32 0.21 True
|
| 4576 |
+
torch_eager cuda_B1_S512_H8_D128_R64 0.21 True
|
| 4577 |
torch_eager cuda_B1_S512_H8_D64_R32 0.22 True
|
| 4578 |
+
torch_eager cuda_B2_S128_H32_D128_R64 0.21 True
|
| 4579 |
+
torch_eager cuda_B2_S128_H32_D64_R32 0.21 True
|
| 4580 |
+
torch_eager cuda_B2_S128_H8_D128_R64 0.21 True
|
| 4581 |
+
torch_eager cuda_B2_S128_H8_D64_R32 0.21 True
|
| 4582 |
torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True
|
| 4583 |
torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
|
| 4584 |
torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True
|
| 4585 |
+
torch_eager cuda_B2_S2048_H8_D64_R32 0.21 True
|
| 4586 |
+
torch_eager cuda_B2_S512_H32_D128_R64 0.21 True
|
| 4587 |
+
torch_eager cuda_B2_S512_H32_D64_R32 0.21 True
|
| 4588 |
torch_eager cuda_B2_S512_H8_D128_R64 0.21 True
|
| 4589 |
+
torch_eager cuda_B2_S512_H8_D64_R32 0.21 True
|
| 4590 |
|
| 4591 |
GENERATING COMBINED VISUALIZATION
|
| 4592 |
|
|
|
|
| 4606 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4607 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4608 |
<div class="uv-logs-content" style="display: none;">
|
| 4609 |
+
Installed 37 packages in 229ms
|
| 4610 |
</div>
|
| 4611 |
</div>
|
| 4612 |
<div class="cell-artifacts">
|
|
|
|
| 4619 |
<rdf:RDF>
|
| 4620 |
<ns2:Work>
|
| 4621 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4622 |
+
<dc:date>2025-10-29T15:51:00.751980</dc:date>
|
| 4623 |
<dc:format>image/svg+xml</dc:format>
|
| 4624 |
<dc:creator>
|
| 4625 |
<ns2:Agent>
|
|
|
|
| 4963 |
<g id="matplotlib.axis_2">
|
| 4964 |
<g id="ytick_1">
|
| 4965 |
<g id="grid-y--2" class="grid grid-y">
|
| 4966 |
+
<path d="M 47.72 392.946895 L 823.142937 392.946895 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4967 |
</g>
|
| 4968 |
<g id="line2d_25">
|
| 4969 |
<defs>
|
| 4970 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4971 |
</defs>
|
| 4972 |
<g>
|
| 4973 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="392.946895" style="stroke: #000000; stroke-width: 0.8" />
|
| 4974 |
</g>
|
| 4975 |
</g>
|
| 4976 |
<g id="text_25">
|
| 4977 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="396.746114" transform="rotate(-0 40.72 396.746114)">0.1</text>
|
| 4978 |
</g>
|
| 4979 |
</g>
|
| 4980 |
<g id="ytick_2">
|
| 4981 |
<g id="grid-y--3" class="grid grid-y">
|
| 4982 |
+
<path d="M 47.72 346.171092 L 823.142937 346.171092 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4983 |
</g>
|
| 4984 |
<g id="line2d_26">
|
| 4985 |
<g>
|
| 4986 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="346.171092" style="stroke: #000000; stroke-width: 0.8" />
|
| 4987 |
</g>
|
| 4988 |
</g>
|
| 4989 |
<g id="text_26">
|
| 4990 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="349.970311" transform="rotate(-0 40.72 349.970311)">0.2</text>
|
| 4991 |
</g>
|
| 4992 |
</g>
|
| 4993 |
<g id="ytick_3">
|
| 4994 |
<g id="grid-y--4" class="grid grid-y">
|
| 4995 |
+
<path d="M 47.72 299.395289 L 823.142937 299.395289 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4996 |
</g>
|
| 4997 |
<g id="line2d_27">
|
| 4998 |
<g>
|
| 4999 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="299.395289" style="stroke: #000000; stroke-width: 0.8" />
|
| 5000 |
</g>
|
| 5001 |
</g>
|
| 5002 |
<g id="text_27">
|
| 5003 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="303.194508" transform="rotate(-0 40.72 303.194508)">0.3</text>
|
| 5004 |
</g>
|
| 5005 |
</g>
|
| 5006 |
<g id="ytick_4">
|
| 5007 |
<g id="grid-y--5" class="grid grid-y">
|
| 5008 |
+
<path d="M 47.72 252.619486 L 823.142937 252.619486 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 5009 |
</g>
|
| 5010 |
<g id="line2d_28">
|
| 5011 |
<g>
|
| 5012 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="252.619486" style="stroke: #000000; stroke-width: 0.8" />
|
| 5013 |
</g>
|
| 5014 |
</g>
|
| 5015 |
<g id="text_28">
|
| 5016 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.418705" transform="rotate(-0 40.72 256.418705)">0.4</text>
|
| 5017 |
</g>
|
| 5018 |
</g>
|
| 5019 |
<g id="ytick_5">
|
| 5020 |
<g id="grid-y--6" class="grid grid-y">
|
| 5021 |
+
<path d="M 47.72 205.843684 L 823.142937 205.843684 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 5022 |
</g>
|
| 5023 |
<g id="line2d_29">
|
| 5024 |
<g>
|
| 5025 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="205.843684" style="stroke: #000000; stroke-width: 0.8" />
|
| 5026 |
</g>
|
| 5027 |
</g>
|
| 5028 |
<g id="text_29">
|
| 5029 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="209.642902" transform="rotate(-0 40.72 209.642902)">0.5</text>
|
| 5030 |
+
</g>
|
| 5031 |
+
</g>
|
| 5032 |
+
<g id="ytick_6">
|
| 5033 |
+
<g id="grid-y--7" class="grid grid-y">
|
| 5034 |
+
<path d="M 47.72 159.067881 L 823.142937 159.067881 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 5035 |
+
</g>
|
| 5036 |
+
<g id="line2d_30">
|
| 5037 |
+
<g>
|
| 5038 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="159.067881" style="stroke: #000000; stroke-width: 0.8" />
|
| 5039 |
+
</g>
|
| 5040 |
+
</g>
|
| 5041 |
+
<g id="text_30">
|
| 5042 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="162.8671" transform="rotate(-0 40.72 162.8671)">0.6</text>
|
| 5043 |
+
</g>
|
| 5044 |
+
</g>
|
| 5045 |
+
<g id="ytick_7">
|
| 5046 |
+
<g id="grid-y--8" class="grid grid-y">
|
| 5047 |
+
<path d="M 47.72 112.292078 L 823.142937 112.292078 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 5048 |
+
</g>
|
| 5049 |
+
<g id="line2d_31">
|
| 5050 |
+
<g>
|
| 5051 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="112.292078" style="stroke: #000000; stroke-width: 0.8" />
|
| 5052 |
+
</g>
|
| 5053 |
+
</g>
|
| 5054 |
+
<g id="text_31">
|
| 5055 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="116.091297" transform="rotate(-0 40.72 116.091297)">0.7</text>
|
| 5056 |
+
</g>
|
| 5057 |
+
</g>
|
| 5058 |
+
<g id="ytick_8">
|
| 5059 |
+
<g id="grid-y--9" class="grid grid-y">
|
| 5060 |
+
<path d="M 47.72 65.516275 L 823.142937 65.516275 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 5061 |
+
</g>
|
| 5062 |
+
<g id="line2d_32">
|
| 5063 |
+
<g>
|
| 5064 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="65.516275" style="stroke: #000000; stroke-width: 0.8" />
|
| 5065 |
+
</g>
|
| 5066 |
+
</g>
|
| 5067 |
+
<g id="text_32">
|
| 5068 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="69.315494" transform="rotate(-0 40.72 69.315494)">0.8</text>
|
| 5069 |
</g>
|
| 5070 |
</g>
|
| 5071 |
<g id="label--y" class="ylabel">
|
| 5072 |
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="224.974753" transform="rotate(-90 18.737188 224.974753)">Latency P50 (ms)</text>
|
| 5073 |
</g>
|
| 5074 |
</g>
|
| 5075 |
+
<g id="series--hf-kernels-rotary" class="series">
|
| 5076 |
+
<path d="M 82.966497 405.060892 L 113.615625 396.688024 L 144.264753 397.225945 L 174.913881 396.996744 L 205.563009 397.632895 L 236.212137 397.35224 L 266.861265 397.324175 L 297.510393 397.91355 L 328.159521 397.843386 L 358.808648 397.146426 L 389.457776 397.071585 L 420.106904 317.340358 L 450.756032 397.211913 L 481.40516 397.459824 L 512.054288 397.151104 L 542.703416 397.263366 L 573.352544 397.511278 L 604.001672 397.314819 L 634.6508 397.539343 L 665.299928 397.628217 L 695.949056 396.861094 L 726.598184 397.997746 L 757.247312 316.58259 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 5077 |
<defs>
|
| 5078 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 5079 |
</defs>
|
| 5080 |
<g clip-path="url(#p088c925177)">
|
| 5081 |
<use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5082 |
+
<use ns4:href="#md7efaf3aec" x="113.615625" y="396.688024" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5083 |
+
<use ns4:href="#md7efaf3aec" x="144.264753" y="397.225945" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5084 |
+
<use ns4:href="#md7efaf3aec" x="174.913881" y="396.996744" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5085 |
+
<use ns4:href="#md7efaf3aec" x="205.563009" y="397.632895" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5086 |
+
<use ns4:href="#md7efaf3aec" x="236.212137" y="397.35224" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5087 |
+
<use ns4:href="#md7efaf3aec" x="266.861265" y="397.324175" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5088 |
+
<use ns4:href="#md7efaf3aec" x="297.510393" y="397.91355" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5089 |
+
<use ns4:href="#md7efaf3aec" x="328.159521" y="397.843386" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5090 |
+
<use ns4:href="#md7efaf3aec" x="358.808648" y="397.146426" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5091 |
+
<use ns4:href="#md7efaf3aec" x="389.457776" y="397.071585" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5092 |
+
<use ns4:href="#md7efaf3aec" x="420.106904" y="317.340358" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5093 |
+
<use ns4:href="#md7efaf3aec" x="450.756032" y="397.211913" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5094 |
+
<use ns4:href="#md7efaf3aec" x="481.40516" y="397.459824" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5095 |
+
<use ns4:href="#md7efaf3aec" x="512.054288" y="397.151104" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5096 |
+
<use ns4:href="#md7efaf3aec" x="542.703416" y="397.263366" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5097 |
+
<use ns4:href="#md7efaf3aec" x="573.352544" y="397.511278" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5098 |
+
<use ns4:href="#md7efaf3aec" x="604.001672" y="397.314819" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5099 |
+
<use ns4:href="#md7efaf3aec" x="634.6508" y="397.539343" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5100 |
+
<use ns4:href="#md7efaf3aec" x="665.299928" y="397.628217" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5101 |
+
<use ns4:href="#md7efaf3aec" x="695.949056" y="396.861094" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5102 |
+
<use ns4:href="#md7efaf3aec" x="726.598184" y="397.997746" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5103 |
+
<use ns4:href="#md7efaf3aec" x="757.247312" y="316.58259" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5104 |
<use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5105 |
</g>
|
| 5106 |
</g>
|
| 5107 |
+
<g id="series--torch-eager" class="series">
|
| 5108 |
+
<path d="M 82.966497 359.229025 L 113.615625 336.991341 L 144.264753 338.68977 L 174.913881 340.649676 L 205.563009 337.183122 L 236.212137 342.30975 L 266.861265 341.346168 L 297.510393 339.896118 L 328.159521 341.271327 L 358.808648 340.438717 L 389.457776 342.132002 L 420.106904 334.919173 L 450.756032 340.405974 L 481.40516 340.528059 L 512.054288 341.112289 L 542.703416 340.373231 L 573.352544 340.218871 L 604.001672 341.364878 L 634.6508 340.069189 L 665.299928 340.574367 L 695.949056 340.438717 L 726.598184 339.068186 L 757.247312 332.696854 L 787.896439 141.132167 " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 5109 |
+
<defs>
|
| 5110 |
+
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 5111 |
+
</defs>
|
| 5112 |
+
<g clip-path="url(#p088c925177)">
|
| 5113 |
+
<use ns4:href="#m9b8c54d372" x="82.966497" y="359.229025" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5114 |
+
<use ns4:href="#m9b8c54d372" x="113.615625" y="336.991341" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5115 |
+
<use ns4:href="#m9b8c54d372" x="144.264753" y="338.68977" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5116 |
+
<use ns4:href="#m9b8c54d372" x="174.913881" y="340.649676" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5117 |
+
<use ns4:href="#m9b8c54d372" x="205.563009" y="337.183122" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5118 |
+
<use ns4:href="#m9b8c54d372" x="236.212137" y="342.30975" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5119 |
+
<use ns4:href="#m9b8c54d372" x="266.861265" y="341.346168" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5120 |
+
<use ns4:href="#m9b8c54d372" x="297.510393" y="339.896118" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5121 |
+
<use ns4:href="#m9b8c54d372" x="328.159521" y="341.271327" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5122 |
+
<use ns4:href="#m9b8c54d372" x="358.808648" y="340.438717" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5123 |
+
<use ns4:href="#m9b8c54d372" x="389.457776" y="342.132002" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5124 |
+
<use ns4:href="#m9b8c54d372" x="420.106904" y="334.919173" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5125 |
+
<use ns4:href="#m9b8c54d372" x="450.756032" y="340.405974" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5126 |
+
<use ns4:href="#m9b8c54d372" x="481.40516" y="340.528059" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5127 |
+
<use ns4:href="#m9b8c54d372" x="512.054288" y="341.112289" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5128 |
+
<use ns4:href="#m9b8c54d372" x="542.703416" y="340.373231" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5129 |
+
<use ns4:href="#m9b8c54d372" x="573.352544" y="340.218871" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5130 |
+
<use ns4:href="#m9b8c54d372" x="604.001672" y="341.364878" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5131 |
+
<use ns4:href="#m9b8c54d372" x="634.6508" y="340.069189" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5132 |
+
<use ns4:href="#m9b8c54d372" x="665.299928" y="340.574367" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5133 |
+
<use ns4:href="#m9b8c54d372" x="695.949056" y="340.438717" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5134 |
+
<use ns4:href="#m9b8c54d372" x="726.598184" y="339.068186" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5135 |
+
<use ns4:href="#m9b8c54d372" x="757.247312" y="332.696854" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5136 |
+
<use ns4:href="#m9b8c54d372" x="787.896439" y="141.132167" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5137 |
+
</g>
|
| 5138 |
+
</g>
|
| 5139 |
<g id="patch_3">
|
| 5140 |
<path d="M 47.72 423.069506 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 5141 |
</g>
|
|
|
|
| 5148 |
<g id="patch_6">
|
| 5149 |
<path d="M 47.72 26.88 L 823.142937 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 5150 |
</g>
|
| 5151 |
+
<g id="text_33">
|
| 5152 |
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.431468" y="20.88" transform="rotate(-0 435.431468 20.88)">Attention Implementation Latency</text>
|
| 5153 |
</g>
|
| 5154 |
<g id="legend" class="legend">
|
| 5155 |
<g id="patch_7">
|
| 5156 |
+
<path d="M 54.72 64.7925 L 172.655938 64.7925 Q 174.655938 64.7925 174.655938 62.7925 L 174.655938 33.88 Q 174.655938 31.88 172.655938 31.88 L 54.72 31.88 Q 52.72 31.88 52.72 33.88 L 52.72 62.7925 Q 52.72 64.7925 54.72 64.7925 L 54.72 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 5157 |
</g>
|
| 5158 |
+
<g id="line2d_33">
|
| 5159 |
<path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 5160 |
<g>
|
| 5161 |
<use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5162 |
</g>
|
| 5163 |
</g>
|
| 5164 |
+
<g id="legend-label--hf-kernels-rotary" class="legend">
|
| 5165 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">hf_kernels_rotary</text>
|
| 5166 |
+
</g>
|
| 5167 |
+
<g id="line2d_34">
|
| 5168 |
+
<path d="M 56.72 54.934687 L 66.72 54.934687 L 76.72 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 5169 |
+
<g>
|
| 5170 |
+
<use ns4:href="#m9b8c54d372" x="66.72" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5171 |
+
</g>
|
| 5172 |
+
</g>
|
| 5173 |
<g id="legend-label--torch-eager" class="legend">
|
| 5174 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="58.434687" transform="rotate(-0 84.72 58.434687)">torch_eager</text>
|
| 5175 |
</g>
|
| 5176 |
</g>
|
| 5177 |
</g>
|