drbh HF Staff committed on
Commit
1c22380
·
verified ·
1 Parent(s): c415961

Upload folder using huggingface_hub

Browse files
Files changed (36) hide show
  1. activation/impls/artifacts/benchmark/activation.jsonl +9 -9
  2. activation/impls/cells/benchmark.py +7 -13
  3. activation/impls/hf_kernels_swiglu.html +99 -91
  4. activation/impls/torch_swiglu.html +129 -127
  5. activation/results/artifacts/combine/latency.svg +2 -2
  6. activation/results/combined_results.html +89 -107
  7. causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +24 -24
  8. causal_conv1d/impls/cells/benchmark.py +18 -9
  9. causal_conv1d/impls/hf_kernels_causal_conv1d.html +0 -0
  10. causal_conv1d/impls/torch_causal_conv1d.html +0 -0
  11. causal_conv1d/results/artifacts/combine/latency.svg +2 -2
  12. causal_conv1d/results/combined_results.html +145 -137
  13. flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -6
  14. flash_attn/impls/cells/benchmark.py +10 -9
  15. flash_attn/impls/flash_attention.html +151 -195
  16. flash_attn/impls/hf_kernels_flash_attn.html +102 -93
  17. flash_attn/impls/hf_kernels_flash_attn3.html +96 -84
  18. flash_attn/impls/mem_efficient_attention.html +139 -131
  19. flash_attn/impls/sage_attention.html +20 -17
  20. flash_attn/impls/xformers.html +146 -92
  21. flash_attn/results/artifacts/combine/latency.svg +2 -2
  22. flash_attn/results/combined_results.html +156 -148
  23. index.html +14 -12
  24. layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +4 -4
  25. layer_norm/impls/cells/benchmark.py +5 -28
  26. layer_norm/impls/hf_kernels_layer_norm.html +62 -54
  27. layer_norm/impls/torch_layer_norm.html +61 -59
  28. layer_norm/results/artifacts/combine/latency.svg +2 -2
  29. layer_norm/results/combined_results.html +60 -52
  30. rotary/impls/artifacts/benchmark/rotary.jsonl +24 -24
  31. rotary/impls/cells/benchmark.py +12 -21
  32. rotary/impls/hf_kernels_rotary.html +0 -0
  33. rotary/impls/torch_rotary.html +0 -0
  34. rotary/index.html +8 -0
  35. rotary/results/artifacts/combine/latency.svg +2 -2
  36. rotary/results/combined_results.html +302 -134
activation/impls/artifacts/benchmark/activation.jsonl CHANGED
@@ -1,9 +1,9 @@
1
- {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02251099999739381, "p50": 0.02324100000805629, "p90": 0.023539999972399528, "mean": 0.023146399996676337, "iqr": 0.0007499999696847226, "raw_times": [0.023539999972399528, 0.022790000002714805, 0.02324100000805629, 0.02365000000281725, 0.02251099999739381], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029810000000907166, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
2
- {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027370999987397227, "p50": 0.028240999995432503, "p90": 0.028329999963716546, "mean": 0.02825879998908931, "iqr": 0.00023899997358967084, "raw_times": [0.028090999990126875, 0.028240999995432503, 0.028329999963716546, 0.029261000008773408, 0.027370999987397227], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03212000001440174, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
3
- {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02641099996480989, "p50": 0.027520999992702855, "p90": 0.028440999983558868, "mean": 0.027734599996165343, "iqr": 0.001440999938040477, "raw_times": [0.02641099996480989, 0.028440999983558868, 0.029299999994236714, 0.027520999992702855, 0.02700000004551839], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.032080999972095015, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
4
- {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026321000007101247, "p50": 0.02724099999795726, "p90": 0.028659999998126295, "mean": 0.02923079999845868, "iqr": 0.0014990000067882647, "raw_times": [0.026321000007101247, 0.03677099999777056, 0.028659999998126295, 0.02716099999133803, 0.02724099999795726], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031121000006351096, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
5
- {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025979999975334067, "p50": 0.028520999990178098, "p90": 0.028720999978304462, "mean": 0.027810800008865044, "iqr": 0.00169999992749581, "raw_times": [0.025979999975334067, 0.028811000049699942, 0.028520999990178098, 0.027021000050808652, 0.028720999978304462], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02976100000751103, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
6
- {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026359999992564553, "p50": 0.027051000017763727, "p90": 0.027101000000584463, "mean": 0.027004599996871548, "iqr": 0.00035100003970001126, "raw_times": [0.027101000000584463, 0.027051000017763727, 0.027761000012560544, 0.026749999960884452, 0.026359999992564553], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029620999953294813, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
7
- {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02576100001761006, "p50": 0.027530000011211087, "p90": 0.02828099997032041, "mean": 0.0273743999969156, "iqr": 0.001340999972399004, "raw_times": [0.02576100001761006, 0.02828099997032041, 0.026939999997921404, 0.02835999998751504, 0.027530000011211087], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030121000008875853, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
8
- {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025459999960730784, "p50": 0.028590999988864496, "p90": 0.02870100001928222, "mean": 0.027812799999082927, "iqr": 0.00113999999484804, "raw_times": [0.025459999960730784, 0.02870100001928222, 0.028751000002102955, 0.02756100002443418, 0.028590999988864496], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029950999987704563, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
9
- {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02632999996876606, "p50": 0.027500999976837193, "p90": 0.028640000039104052, "mean": 0.028318399995441723, "iqr": 0.0021100000253682083, "raw_times": [0.02632999996876606, 0.03259099997876547, 0.027500999976837193, 0.026530000013735844, 0.028640000039104052], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029991000019435887, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.040432000048440386, "p50": 0.04165099994679622, "p90": 0.0417410000181917, "mean": 0.04172699999571705, "iqr": 0.0011400000516914588, "raw_times": [0.0417410000181917, 0.04420999999865671, 0.040432000048440386, 0.04165099994679622, 0.04060099996650024], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.046430999987023824, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
2
+ {"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04963099996757592, "p50": 0.05265099997586731, "p90": 0.053851000018312334, "mean": 0.054568999985349365, "iqr": 0.0016500000583619112, "raw_times": [0.04963099996757592, 0.05265099997586731, 0.05220099995995042, 0.053851000018312334, 0.06451100000504084], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05472100002634761, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
3
+ {"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04966099999137441, "p50": 0.05102099999021448, "p90": 0.05103099999814731, "mean": 0.05151719999503257, "iqr": 0.0007099999947968172, "raw_times": [0.04966099999137441, 0.05555199999207616, 0.05032100000335049, 0.05102099999021448, 0.05103099999814731], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05423200002496742, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
4
+ {"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04886099998202553, "p50": 0.05024199998615586, "p90": 0.0503609999782384, "mean": 0.05005519998348973, "iqr": 0.0007900000014160469, "raw_times": [0.04886099998202553, 0.04957099997682235, 0.051240999994206504, 0.05024199998615586, 0.0503609999782384], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053871000034177996, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
5
+ {"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04914099997677113, "p50": 0.04985100002841136, "p90": 0.05049099996767836, "mean": 0.04988699998875745, "iqr": 0.0013399999829744047, "raw_times": [0.04915099998470396, 0.05080099998622245, 0.04985100002841136, 0.04914099997677113, 0.05049099996767836], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053920999960155314, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
6
+ {"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04656100003330721, "p50": 0.04960100000062084, "p90": 0.05333199999313365, "mean": 0.05254540001260466, "iqr": 0.0039209999727063405, "raw_times": [0.04656100003330721, 0.05333199999313365, 0.04960100000062084, 0.04941100002042731, 0.06382200001553429], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.051971000004868984, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
7
+ {"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04889099994898061, "p50": 0.050290999979552, "p90": 0.05037099998617123, "mean": 0.05047499996635452, "iqr": 0.0002600000357233512, "raw_times": [0.04889099994898061, 0.052710999966620875, 0.050110999950447876, 0.05037099998617123, 0.050290999979552], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05234200000359124, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
8
+ {"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0489209999727791, "p50": 0.04973099999006081, "p90": 0.05078099997035679, "mean": 0.051391199974659685, "iqr": 0.0012099999935344385, "raw_times": [0.0489209999727791, 0.05078099997035679, 0.04973099999006081, 0.04957099997682235, 0.05795199996327938], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0512020000087432, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
9
+ {"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04852099999652637, "p50": 0.04917100000056962, "p90": 0.049370999988695985, "mean": 0.049055200008751854, "iqr": 0.0007299999538190605, "raw_times": [0.04852099999652637, 0.048641000034876924, 0.04917100000056962, 0.049370999988695985, 0.04957200002309037], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05309099998385136, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
activation/impls/cells/benchmark.py CHANGED
@@ -4,7 +4,6 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
- # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
@@ -13,22 +12,17 @@
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
- from kernels import get_kernel
17
 
18
- # Load the activation kernel
19
- activation = get_kernel("kernels-community/activation")
20
 
21
-
22
- def hf_kernels_swiglu(input_tensor):
23
- hidden_dim = input_tensor.shape[-1] // 2
24
- out_shape = input_tensor.shape[:-1] + (hidden_dim,)
25
- out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
26
- return activation.silu_and_mul(out, input_tensor)
27
 
28
 
29
  run_benchmark(
30
  kernel_type=KernelTypeEnum.ACTIVATION,
31
- impl_name="hf_kernels_swiglu",
32
- impl_tags={"family": "hf-kernels", "backend": "cuda"},
33
- impl_func=hf_kernels_swiglu,
34
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
 
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
15
+ import torch, torch.nn.functional as F
16
 
 
 
17
 
18
+ def swiglu_eager(x):
19
+ d = x.shape[-1] // 2
20
+ return F.silu(x[..., :d]) * x[..., d:]
 
 
 
21
 
22
 
23
  run_benchmark(
24
  kernel_type=KernelTypeEnum.ACTIVATION,
25
+ impl_name="torch_eager",
26
+ impl_tags={"family":"hf-kernels", "backend":"eager"},
27
+ impl_func=swiglu_eager,
28
  )
activation/impls/hf_kernels_swiglu.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
@@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: nv | 0.26s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3895,7 @@ Cell: nv | 0.26s
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
- <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:26:44 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3904,7 @@ Cell: nv | 0.26s
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
- | N/A 32C P0 133W / 350W | 0MiB / 46068MiB | 100% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
@@ -3920,7 +3928,7 @@ Cell: nv | 0.26s
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
- Cell: benchmark | 4.19s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3976,17 +3984,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
3976
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3978
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3979
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 72.288us 1807.20% 72.288us 72.288us 1
3980
- hf_kernels_swiglu 12.07% 211.387us 99.59% 1.744ms 1.744ms 0.000us 0.00% 5.376us 5.376us 1
3981
- _activation_beeaae6::silu_and_mul 1.10% 19.319us 84.87% 1.486ms 495.368us 4.000us 100.00% 5.376us 1.792us 3
3982
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.000us 100.00% 4.000us 1.333us 3
3983
- Activity Buffer Request 81.49% 1.427ms 81.49% 1.427ms 1.427ms 1.376us 34.40% 1.376us 1.376us 1
3984
- aten::empty 2.64% 46.231us 2.64% 46.231us 15.410us 0.000us 0.00% 0.000us 0.000us 3
3985
- cudaLaunchKernel 2.28% 39.911us 2.28% 39.911us 13.304us 0.000us 0.00% 0.000us 0.000us 3
3986
- cudaDeviceSynchronize 0.41% 7.220us 0.41% 7.220us 7.220us 0.000us 0.00% 0.000us 0.000us 1
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
- Self CPU time total: 1.751ms
3989
- Self CUDA time total: 4.000us
3990
 
3991
 
3992
 
@@ -3996,16 +4004,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
3996
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3997
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.686us 1579.79% 62.686us 62.686us 1
4000
- hf_kernels_swiglu 6.72% 108.943us 99.67% 1.616ms 1.616ms 0.000us 0.00% 5.312us 5.312us 1
4001
- _activation_beeaae6::silu_and_mul 1.34% 21.721us 91.77% 1.488ms 495.875us 3.968us 100.00% 5.312us 1.771us 3
4002
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3
4003
- Activity Buffer Request 88.82% 1.440ms 88.82% 1.440ms 1.440ms 1.344us 33.87% 1.344us 1.344us 1
4004
- aten::empty 1.18% 19.150us 1.18% 19.150us 6.383us 0.000us 0.00% 0.000us 0.000us 3
4005
- cudaLaunchKernel 1.61% 26.150us 1.61% 26.150us 8.717us 0.000us 0.00% 0.000us 0.000us 3
4006
- cudaDeviceSynchronize 0.33% 5.310us 0.33% 5.310us 5.310us 0.000us 0.00% 0.000us 0.000us 1
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
- Self CPU time total: 1.621ms
4009
  Self CUDA time total: 3.968us
4010
 
4011
 
@@ -4016,17 +4024,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.687us 1361.79% 66.687us 66.687us 1
4020
- hf_kernels_swiglu 6.74% 109.943us 99.70% 1.626ms 1.626ms 0.000us 0.00% 6.529us 6.529us 1
4021
- _activation_beeaae6::silu_and_mul 1.25% 20.459us 91.78% 1.496ms 498.816us 4.897us 100.00% 6.529us 2.176us 3
4022
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.897us 100.00% 4.897us 1.632us 3
4023
- Activity Buffer Request 88.91% 1.450ms 88.91% 1.450ms 1.450ms 1.632us 33.33% 1.632us 1.632us 1
4024
- aten::empty 1.18% 19.260us 1.18% 19.260us 6.420us 0.000us 0.00% 0.000us 0.000us 3
4025
- cudaLaunchKernel 1.61% 26.232us 1.61% 26.232us 8.744us 0.000us 0.00% 0.000us 0.000us 3
4026
- cudaDeviceSynchronize 0.30% 4.870us 0.30% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
- Self CPU time total: 1.631ms
4029
- Self CUDA time total: 4.897us
4030
 
4031
 
4032
 
@@ -4036,17 +4044,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.081us 1552.66% 66.081us 66.081us 1
4040
- hf_kernels_swiglu 6.15% 108.423us 99.71% 1.758ms 1.758ms 0.000us 0.00% 5.696us 5.696us 1
4041
- _activation_beeaae6::silu_and_mul 1.25% 22.001us 92.49% 1.631ms 543.697us 4.256us 100.00% 5.696us 1.899us 3
4042
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.256us 100.00% 4.256us 1.419us 3
4043
- Activity Buffer Request 80.93% 1.427ms 80.93% 1.427ms 1.427ms 1.440us 33.83% 1.440us 1.440us 1
4044
- aten::empty 1.07% 18.910us 1.07% 18.910us 6.303us 0.000us 0.00% 0.000us 0.000us 3
4045
- cudaLaunchKernel 10.31% 181.874us 10.31% 181.874us 60.625us 0.000us 0.00% 0.000us 0.000us 3
4046
- cudaDeviceSynchronize 0.29% 5.110us 0.29% 5.110us 5.110us 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
- Self CPU time total: 1.764ms
4049
- Self CUDA time total: 4.256us
4050
 
4051
 
4052
 
@@ -4056,17 +4064,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.167us 1072.63% 63.167us 63.167us 1
4060
- hf_kernels_swiglu 15.22% 87.332us 99.19% 569.294us 569.294us 0.000us 0.00% 7.873us 7.873us 1
4061
- _activation_beeaae6::silu_and_mul 3.58% 20.570us 80.67% 463.002us 154.334us 5.889us 100.00% 7.873us 2.624us 3
4062
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.889us 100.00% 5.889us 1.963us 3
4063
- Activity Buffer Request 48.76% 279.877us 48.76% 279.877us 279.877us 1.984us 33.69% 1.984us 1.984us 1
4064
- aten::empty 3.30% 18.960us 3.30% 18.960us 6.320us 0.000us 0.00% 0.000us 0.000us 3
4065
- cudaLaunchKernel 28.32% 162.555us 28.32% 162.555us 54.185us 0.000us 0.00% 0.000us 0.000us 3
4066
- cudaDeviceSynchronize 0.81% 4.660us 0.81% 4.660us 4.660us 0.000us 0.00% 0.000us 0.000us 1
4067
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4068
- Self CPU time total: 573.954us
4069
- Self CUDA time total: 5.889us
4070
 
4071
 
4072
 
@@ -4076,17 +4084,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4078
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4079
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.632us 906.67% 69.632us 69.632us 1
4080
- hf_kernels_swiglu 6.07% 107.484us 99.73% 1.766ms 1.766ms 0.000us 0.00% 10.240us 10.240us 1
4081
- _activation_beeaae6::silu_and_mul 1.19% 21.010us 92.55% 1.639ms 546.413us 7.680us 100.00% 10.240us 3.413us 3
4082
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.680us 100.00% 7.680us 2.560us 3
4083
- Activity Buffer Request 81.69% 1.447ms 81.69% 1.447ms 1.447ms 2.560us 33.33% 2.560us 2.560us 1
4084
- aten::empty 1.11% 19.720us 1.11% 19.720us 6.573us 0.000us 0.00% 0.000us 0.000us 3
4085
- cudaLaunchKernel 9.67% 171.234us 9.67% 171.234us 57.078us 0.000us 0.00% 0.000us 0.000us 3
4086
- cudaDeviceSynchronize 0.27% 4.800us 0.27% 4.800us 4.800us 0.000us 0.00% 0.000us 0.000us 1
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
- Self CPU time total: 1.771ms
4089
- Self CUDA time total: 7.680us
4090
 
4091
 
4092
 
@@ -4096,16 +4104,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
4096
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4097
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4098
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4099
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 72.064us 1098.54% 72.064us 72.064us 1
4100
- hf_kernels_swiglu 6.19% 109.521us 99.72% 1.763ms 1.763ms 0.000us 0.00% 8.768us 8.768us 1
4101
- _activation_beeaae6::silu_and_mul 1.22% 21.580us 92.43% 1.635ms 544.850us 6.560us 100.00% 8.768us 2.923us 3
4102
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 100.00% 6.560us 2.187us 3
4103
- Activity Buffer Request 81.92% 1.449ms 81.92% 1.449ms 1.449ms 2.208us 33.66% 2.208us 2.208us 1
4104
- aten::empty 1.09% 19.351us 1.09% 19.351us 6.450us 0.000us 0.00% 0.000us 0.000us 3
4105
- cudaLaunchKernel 9.29% 164.205us 9.29% 164.205us 54.735us 0.000us 0.00% 0.000us 0.000us 3
4106
- cudaDeviceSynchronize 0.28% 4.990us 0.28% 4.990us 4.990us 0.000us 0.00% 0.000us 0.000us 1
4107
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4108
- Self CPU time total: 1.768ms
4109
  Self CUDA time total: 6.560us
4110
 
4111
 
@@ -4116,16 +4124,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
4116
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4117
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.118us 692.16% 65.118us 65.118us 1
4120
- hf_kernels_swiglu 16.62% 89.683us 99.03% 534.374us 534.374us 0.000us 0.00% 12.576us 12.576us 1
4121
- _activation_beeaae6::silu_and_mul 3.96% 21.372us 78.99% 426.201us 142.067us 9.408us 100.00% 12.576us 4.192us 3
4122
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.408us 100.00% 9.408us 3.136us 3
4123
- Activity Buffer Request 44.61% 240.735us 44.61% 240.735us 240.735us 3.168us 33.67% 3.168us 3.168us 1
4124
- aten::empty 3.43% 18.490us 3.43% 18.490us 6.163us 0.000us 0.00% 0.000us 0.000us 3
4125
- cudaLaunchKernel 30.41% 164.094us 30.41% 164.094us 54.698us 0.000us 0.00% 0.000us 0.000us 3
4126
- cudaDeviceSynchronize 0.97% 5.210us 0.97% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
- Self CPU time total: 539.584us
4129
  Self CUDA time total: 9.408us
4130
 
4131
 
@@ -4136,17 +4144,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
4136
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4137
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4138
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4139
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.182us 527.34% 69.182us 69.182us 1
4140
- hf_kernels_swiglu 12.86% 103.214us 99.41% 797.800us 797.800us 0.000us 0.00% 17.534us 17.534us 1
4141
- _activation_beeaae6::silu_and_mul 2.63% 21.139us 84.20% 675.726us 225.242us 13.119us 100.00% 17.534us 5.845us 3
4142
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.119us 100.00% 13.119us 4.373us 3
4143
- Activity Buffer Request 61.21% 491.232us 61.21% 491.232us 491.232us 4.415us 33.65% 4.415us 4.415us 1
4144
- aten::empty 2.35% 18.860us 2.35% 18.860us 6.287us 0.000us 0.00% 0.000us 0.000us 3
4145
- cudaLaunchKernel 20.35% 163.355us 20.35% 163.355us 54.452us 0.000us 0.00% 0.000us 0.000us 3
4146
- cudaDeviceSynchronize 0.59% 4.750us 0.59% 4.750us 4.750us 0.000us 0.00% 0.000us 0.000us 1
4147
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4148
- Self CPU time total: 802.550us
4149
- Self CUDA time total: 13.119us
4150
 
4151
 
4152
  impl wl p50(ms) ok
@@ -4163,12 +4171,12 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
4163
  <div class="uv-install-logs" id="uv-logs-benchmark">
4164
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4165
  <div class="uv-logs-content" style="display: none;">
4166
- Installed 15 packages in 13ms
4167
  </div>
4168
  </div>
4169
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4170
- Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 14.29it/s]
4171
- Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 19.98it/s]</div>
4172
  <div class="cell-artifacts">
4173
  <h4>Artifacts:</h4>
4174
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
 
3879
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3880
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3881
  </span> |
3882
+ Cell: nv | 0.21s
3883
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3884
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3885
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3895
  </div>
3896
  </div>
3897
  <div id="output-nv" class="cell-output">
3898
+ <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 15:50:40 2025
3899
  +-----------------------------------------------------------------------------------------+
3900
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3901
  |-----------------------------------------+------------------------+----------------------+
 
3904
  | | | MIG M. |
3905
  |=========================================+========================+======================|
3906
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3907
+ | N/A 28C P0 78W / 350W | 0MiB / 46068MiB | 11% Default |
3908
  | | | N/A |
3909
  +-----------------------------------------+------------------------+----------------------+
3910
 
 
3928
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3929
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3930
  </span> |
3931
+ Cell: benchmark | 7.78s
3932
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3933
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3934
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3984
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3985
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 79.968us 1983.33% 79.968us 79.968us 1
3988
+ hf_kernels_swiglu 10.58% 184.424us 99.57% 1.736ms 1.736ms 0.000us 0.00% 5.408us 5.408us 1
3989
+ _activation_beeaae6::silu_and_mul 1.26% 21.900us 86.25% 1.504ms 501.188us 4.032us 100.00% 5.408us 1.803us 3
3990
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.032us 100.00% 4.032us 1.344us 3
3991
+ Activity Buffer Request 82.49% 1.438ms 82.49% 1.438ms 1.438ms 1.376us 34.13% 1.376us 1.376us 1
3992
+ aten::empty 2.74% 47.772us 2.74% 47.772us 15.924us 0.000us 0.00% 0.000us 0.000us 3
3993
+ cudaLaunchKernel 2.50% 43.631us 2.50% 43.631us 14.544us 0.000us 0.00% 0.000us 0.000us 3
3994
+ cudaDeviceSynchronize 0.43% 7.440us 0.43% 7.440us 7.440us 0.000us 0.00% 0.000us 0.000us 1
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
+ Self CPU time total: 1.743ms
3997
+ Self CUDA time total: 4.032us
3998
 
3999
 
4000
 
 
4004
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 60.192us 1516.94% 60.192us 60.192us 1
4008
+ hf_kernels_swiglu 5.66% 89.803us 99.62% 1.581ms 1.581ms 0.000us 0.00% 5.312us 5.312us 1
4009
+ _activation_beeaae6::silu_and_mul 1.35% 21.470us 92.79% 1.473ms 491.035us 3.968us 100.00% 5.312us 1.771us 3
4010
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3
4011
+ Activity Buffer Request 89.86% 1.427ms 89.86% 1.427ms 1.427ms 1.344us 33.87% 1.344us 1.344us 1
4012
+ aten::empty 1.17% 18.590us 1.17% 18.590us 6.197us 0.000us 0.00% 0.000us 0.000us 3
4013
+ cudaLaunchKernel 1.58% 25.022us 1.58% 25.022us 8.341us 0.000us 0.00% 0.000us 0.000us 3
4014
+ cudaDeviceSynchronize 0.38% 6.110us 0.38% 6.110us 6.110us 0.000us 0.00% 0.000us 0.000us 1
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
+ Self CPU time total: 1.588ms
4017
  Self CUDA time total: 3.968us
4018
 
4019
 
 
4024
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4026
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.535us 1338.54% 65.535us 65.535us 1
4028
+ hf_kernels_swiglu 5.56% 88.483us 99.64% 1.586ms 1.586ms 0.000us 0.00% 6.528us 6.528us 1
4029
+ _activation_beeaae6::silu_and_mul 1.35% 21.452us 92.87% 1.478ms 492.822us 4.896us 100.00% 6.528us 2.176us 3
4030
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.896us 100.00% 4.896us 1.632us 3
4031
+ Activity Buffer Request 89.90% 1.431ms 89.90% 1.431ms 1.431ms 1.632us 33.33% 1.632us 1.632us 1
4032
+ aten::empty 1.21% 19.310us 1.21% 19.310us 6.437us 0.000us 0.00% 0.000us 0.000us 3
4033
+ cudaLaunchKernel 1.63% 25.910us 1.63% 25.910us 8.637us 0.000us 0.00% 0.000us 0.000us 3
4034
+ cudaDeviceSynchronize 0.36% 5.661us 0.36% 5.661us 5.661us 0.000us 0.00% 0.000us 0.000us 1
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
+ Self CPU time total: 1.592ms
4037
+ Self CUDA time total: 4.896us
4038
 
4039
 
4040
 
 
4044
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4045
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4046
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4047
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.008us 1562.69% 67.008us 67.008us 1
4048
+ hf_kernels_swiglu 4.93% 90.832us 99.72% 1.836ms 1.836ms 0.000us 0.00% 5.728us 5.728us 1
4049
+ _activation_beeaae6::silu_and_mul 1.23% 22.581us 93.74% 1.726ms 575.177us 4.288us 100.00% 5.728us 1.909us 3
4050
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.288us 100.00% 4.288us 1.429us 3
4051
+ Activity Buffer Request 81.40% 1.498ms 81.40% 1.498ms 1.498ms 1.440us 33.58% 1.440us 1.440us 1
4052
+ aten::empty 1.04% 19.180us 1.04% 19.180us 6.393us 0.000us 0.00% 0.000us 0.000us 3
4053
+ cudaLaunchKernel 11.11% 204.595us 11.11% 204.595us 68.198us 0.000us 0.00% 0.000us 0.000us 3
4054
+ cudaDeviceSynchronize 0.28% 5.180us 0.28% 5.180us 5.180us 0.000us 0.00% 0.000us 0.000us 1
4055
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4056
+ Self CPU time total: 1.841ms
4057
+ Self CUDA time total: 4.288us
4058
 
4059
 
4060
 
 
4064
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4065
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4066
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4067
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.800us 1106.37% 64.800us 64.800us 1
4068
+ hf_kernels_swiglu 5.65% 97.973us 99.69% 1.728ms 1.728ms 0.000us 0.00% 7.810us 7.810us 1
4069
+ _activation_beeaae6::silu_and_mul 1.27% 22.090us 92.96% 1.611ms 536.996us 5.857us 100.00% 7.810us 2.603us 3
4070
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.857us 100.00% 5.857us 1.952us 3
4071
+ Activity Buffer Request 82.37% 1.427ms 82.37% 1.427ms 1.427ms 1.953us 33.34% 1.953us 1.953us 1
4072
+ aten::empty 1.09% 18.810us 1.09% 18.810us 6.270us 0.000us 0.00% 0.000us 0.000us 3
4073
+ cudaLaunchKernel 9.31% 161.434us 9.31% 161.434us 53.811us 0.000us 0.00% 0.000us 0.000us 3
4074
+ cudaDeviceSynchronize 0.31% 5.300us 0.31% 5.300us 5.300us 0.000us 0.00% 0.000us 0.000us 1
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
+ Self CPU time total: 1.733ms
4077
+ Self CUDA time total: 5.857us
4078
 
4079
 
4080
 
 
4084
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4085
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4086
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4087
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 77.311us 1002.48% 77.311us 77.311us 1
4088
+ hf_kernels_swiglu 20.04% 98.272us 98.88% 484.972us 484.972us 0.000us 0.00% 10.304us 10.304us 1
4089
+ _activation_beeaae6::silu_and_mul 4.97% 24.390us 74.66% 366.210us 122.070us 7.712us 100.00% 10.304us 3.435us 3
4090
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 100.00% 7.712us 2.571us 3
4091
+ Activity Buffer Request 34.13% 167.415us 34.13% 167.415us 167.415us 2.592us 33.61% 2.592us 2.592us 1
4092
+ aten::empty 4.18% 20.490us 4.18% 20.490us 6.830us 0.000us 0.00% 0.000us 0.000us 3
4093
+ cudaLaunchKernel 35.56% 174.405us 35.56% 174.405us 58.135us 0.000us 0.00% 0.000us 0.000us 3
4094
+ cudaDeviceSynchronize 1.12% 5.511us 1.12% 5.511us 5.511us 0.000us 0.00% 0.000us 0.000us 1
4095
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4096
+ Self CPU time total: 490.483us
4097
+ Self CUDA time total: 7.712us
4098
 
4099
 
4100
 
 
4104
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4105
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4106
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4107
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.327us 965.35% 63.327us 63.327us 1
4108
+ hf_kernels_swiglu 20.14% 83.823us 98.84% 411.400us 411.400us 0.000us 0.00% 8.768us 8.768us 1
4109
+ _activation_beeaae6::silu_and_mul 5.43% 22.601us 74.29% 309.187us 103.062us 6.560us 100.00% 8.768us 2.923us 3
4110
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 100.00% 6.560us 2.187us 3
4111
+ Activity Buffer Request 32.27% 134.313us 32.27% 134.313us 134.313us 2.208us 33.66% 2.208us 2.208us 1
4112
+ aten::empty 4.42% 18.390us 4.42% 18.390us 6.130us 0.000us 0.00% 0.000us 0.000us 3
4113
+ cudaLaunchKernel 36.59% 152.273us 36.59% 152.273us 50.758us 0.000us 0.00% 0.000us 0.000us 3
4114
+ cudaDeviceSynchronize 1.16% 4.810us 1.16% 4.810us 4.810us 0.000us 0.00% 0.000us 0.000us 1
4115
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4116
+ Self CPU time total: 416.210us
4117
  Self CUDA time total: 6.560us
4118
 
4119
 
 
4124
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4125
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4126
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4127
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.952us 743.54% 69.952us 69.952us 1
4128
+ hf_kernels_swiglu 5.37% 93.270us 99.70% 1.733ms 1.733ms 0.000us 0.00% 12.544us 12.544us 1
4129
+ _activation_beeaae6::silu_and_mul 1.28% 22.251us 93.17% 1.619ms 539.830us 9.408us 100.00% 12.544us 4.181us 3
4130
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.408us 100.00% 9.408us 3.136us 3
4131
+ Activity Buffer Request 83.02% 1.443ms 83.02% 1.443ms 1.443ms 3.136us 33.33% 3.136us 3.136us 1
4132
+ aten::empty 1.17% 20.271us 1.17% 20.271us 6.757us 0.000us 0.00% 0.000us 0.000us 3
4133
+ cudaLaunchKernel 8.87% 154.165us 8.87% 154.165us 51.388us 0.000us 0.00% 0.000us 0.000us 3
4134
+ cudaDeviceSynchronize 0.30% 5.210us 0.30% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1
4135
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4136
+ Self CPU time total: 1.738ms
4137
  Self CUDA time total: 9.408us
4138
 
4139
 
 
4144
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4145
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4146
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4147
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.278us 502.45% 65.278us 65.278us 1
4148
+ hf_kernels_swiglu 20.56% 86.143us 98.78% 413.910us 413.910us 0.000us 0.00% 17.344us 17.344us 1
4149
+ _activation_beeaae6::silu_and_mul 5.61% 23.493us 73.70% 308.818us 102.939us 12.992us 100.00% 17.344us 5.781us 3
4150
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 12.992us 100.00% 12.992us 4.331us 3
4151
+ Activity Buffer Request 31.64% 132.592us 31.64% 132.592us 132.592us 4.352us 33.50% 4.352us 4.352us 1
4152
+ aten::empty 4.52% 18.949us 4.52% 18.949us 6.316us 0.000us 0.00% 0.000us 0.000us 3
4153
+ cudaLaunchKernel 36.45% 152.733us 36.45% 152.733us 50.911us 0.000us 0.00% 0.000us 0.000us 3
4154
+ cudaDeviceSynchronize 1.22% 5.130us 1.22% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1
4155
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4156
+ Self CPU time total: 419.040us
4157
+ Self CUDA time total: 12.992us
4158
 
4159
 
4160
  impl wl p50(ms) ok
 
4171
  <div class="uv-install-logs" id="uv-logs-benchmark">
4172
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4173
  <div class="uv-logs-content" style="display: none;">
4174
+ Installed 52 packages in 252ms
4175
  </div>
4176
  </div>
4177
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4178
+ Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 17.75it/s]
4179
+ Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 24.82it/s]</div>
4180
  <div class="cell-artifacts">
4181
  <h4>Artifacts:</h4>
4182
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
activation/impls/torch_swiglu.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
@@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: nv | 0.26s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3895,7 @@ Cell: nv | 0.26s
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
- <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:26:44 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3904,7 @@ Cell: nv | 0.26s
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
- | N/A 32C P0 133W / 350W | 0MiB / 46068MiB | 100% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
@@ -3918,9 +3926,9 @@ Cell: nv | 0.26s
3918
  <span class="collapse-indicators">
3919
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
- Cell: benchmark | 6.86s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3970,19 +3978,19 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
3970
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3971
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 189.470us 1483.94% 189.470us 189.470us 1
3974
- torch_eager 11.64% 220.727us 99.60% 1.889ms 1.889ms 0.000us 0.00% 15.103us 15.103us 1
3975
- aten::silu 3.36% 63.732us 81.84% 1.552ms 517.326us 6.559us 51.37% 8.894us 2.965us 3
3976
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.559us 51.37% 6.559us 2.186us 3
3977
- aten::mul 1.83% 34.608us 3.05% 57.780us 19.260us 6.209us 48.63% 6.209us 2.070us 3
3978
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.209us 48.63% 6.209us 2.070us 3
3979
- Activity Buffer Request 76.17% 1.444ms 76.17% 1.444ms 1.444ms 2.335us 18.29% 2.335us 2.335us 1
3980
- aten::slice 2.47% 46.790us 3.07% 58.281us 9.714us 0.000us 0.00% 0.000us 0.000us 6
3981
- aten::as_strided 0.61% 11.491us 0.61% 11.491us 1.915us 0.000us 0.00% 0.000us 0.000us 6
3982
- cudaLaunchKernel 3.54% 67.043us 3.54% 67.043us 11.174us 0.000us 0.00% 0.000us 0.000us 6
3983
- cudaDeviceSynchronize 0.40% 7.531us 0.40% 7.531us 7.531us 0.000us 0.00% 0.000us 0.000us 1
3984
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3985
- Self CPU time total: 1.896ms
3986
  Self CUDA time total: 12.768us
3987
 
3988
 
@@ -3993,20 +4001,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 160.895us 1299.43% 160.895us 160.895us 1
3997
- torch_eager 6.82% 117.243us 99.71% 1.713ms 1.713ms 0.000us 0.00% 14.558us 14.558us 1
3998
- aten::silu 2.46% 42.340us 88.23% 1.516ms 505.362us 6.399us 51.68% 8.575us 2.858us 3
3999
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.68% 6.399us 2.133us 3
4000
- aten::mul 1.64% 28.101us 2.83% 48.681us 16.227us 5.983us 48.32% 5.983us 1.994us 3
4001
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.983us 48.32% 5.983us 1.994us 3
4002
- Activity Buffer Request 84.10% 1.445ms 84.10% 1.445ms 1.445ms 2.176us 17.57% 2.176us 2.176us 1
4003
- aten::slice 1.47% 25.252us 1.82% 31.222us 5.204us 0.000us 0.00% 0.000us 0.000us 6
4004
- aten::as_strided 0.35% 5.970us 0.35% 5.970us 0.995us 0.000us 0.00% 0.000us 0.000us 6
4005
- cudaLaunchKernel 2.87% 49.290us 2.87% 49.290us 8.215us 0.000us 0.00% 0.000us 0.000us 6
4006
- cudaDeviceSynchronize 0.29% 5.020us 0.29% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
- Self CPU time total: 1.718ms
4009
- Self CUDA time total: 12.382us
4010
 
4011
 
4012
 
@@ -4016,20 +4024,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 157.982us 1195.38% 157.982us 157.982us 1
4020
- torch_eager 6.51% 110.244us 99.65% 1.686ms 1.686ms 0.000us 0.00% 15.488us 15.488us 1
4021
- aten::silu 2.52% 42.653us 88.50% 1.498ms 499.192us 6.784us 51.33% 9.056us 3.019us 3
4022
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.33% 6.784us 2.261us 3
4023
- aten::mul 1.66% 28.021us 2.76% 46.791us 15.597us 6.432us 48.67% 6.432us 2.144us 3
4024
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 48.67% 6.432us 2.144us 3
4025
- Activity Buffer Request 84.30% 1.427ms 84.30% 1.427ms 1.427ms 2.272us 17.19% 2.272us 2.272us 1
4026
- aten::slice 1.51% 25.627us 1.87% 31.700us 5.283us 0.000us 0.00% 0.000us 0.000us 6
4027
- aten::as_strided 0.36% 6.073us 0.36% 6.073us 1.012us 0.000us 0.00% 0.000us 0.000us 6
4028
- cudaLaunchKernel 2.78% 47.050us 2.78% 47.050us 7.842us 0.000us 0.00% 0.000us 0.000us 6
4029
- cudaDeviceSynchronize 0.35% 5.950us 0.35% 5.950us 5.950us 0.000us 0.00% 0.000us 0.000us 1
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
- Self CPU time total: 1.692ms
4032
- Self CUDA time total: 13.216us
4033
 
4034
 
4035
 
@@ -4039,20 +4047,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4041
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4042
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.902us 1258.67% 159.902us 159.902us 1
4043
- torch_eager 6.73% 114.317us 99.66% 1.694ms 1.694ms 0.000us 0.00% 14.912us 14.912us 1
4044
- aten::silu 2.46% 41.881us 88.34% 1.501ms 500.465us 6.560us 51.64% 8.768us 2.923us 3
4045
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.64% 6.560us 2.187us 3
4046
- aten::mul 1.68% 28.581us 2.79% 47.441us 15.814us 6.144us 48.36% 6.144us 2.048us 3
4047
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.144us 48.36% 6.144us 2.048us 3
4048
- Activity Buffer Request 74.33% 1.263ms 74.33% 1.263ms 1.263ms 2.208us 17.38% 2.208us 2.208us 1
4049
- aten::slice 1.44% 24.468us 1.80% 30.638us 5.106us 0.000us 0.00% 0.000us 0.000us 6
4050
- aten::as_strided 0.36% 6.170us 0.36% 6.170us 1.028us 0.000us 0.00% 0.000us 0.000us 6
4051
- cudaLaunchKernel 12.65% 214.994us 12.65% 214.994us 35.832us 0.000us 0.00% 0.000us 0.000us 6
4052
- cudaDeviceSynchronize 0.34% 5.830us 0.34% 5.830us 5.830us 0.000us 0.00% 0.000us 0.000us 1
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
- Self CPU time total: 1.700ms
4055
- Self CUDA time total: 12.704us
4056
 
4057
 
4058
 
@@ -4062,20 +4070,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
4062
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4063
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4064
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4065
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 157.053us 1185.48% 157.053us 157.053us 1
4066
- torch_eager 6.08% 111.294us 99.69% 1.824ms 1.824ms 0.000us 0.00% 15.552us 15.552us 1
4067
- aten::silu 2.39% 43.729us 89.42% 1.636ms 545.306us 6.784us 51.21% 9.088us 3.029us 3
4068
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.21% 6.784us 2.261us 3
4069
- aten::mul 1.44% 26.361us 2.52% 46.181us 15.394us 6.464us 48.79% 6.464us 2.155us 3
4070
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.79% 6.464us 2.155us 3
4071
- Activity Buffer Request 77.97% 1.426ms 77.97% 1.426ms 1.426ms 2.304us 17.39% 2.304us 2.304us 1
4072
- aten::slice 1.34% 24.571us 1.66% 30.441us 5.074us 0.000us 0.00% 0.000us 0.000us 6
4073
- aten::as_strided 0.32% 5.870us 0.32% 5.870us 0.978us 0.000us 0.00% 0.000us 0.000us 6
4074
- cudaLaunchKernel 10.14% 185.544us 10.14% 185.544us 30.924us 0.000us 0.00% 0.000us 0.000us 6
4075
- cudaDeviceSynchronize 0.31% 5.601us 0.31% 5.601us 5.601us 0.000us 0.00% 0.000us 0.000us 1
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
- Self CPU time total: 1.829ms
4078
- Self CUDA time total: 13.248us
4079
 
4080
 
4081
 
@@ -4085,20 +4093,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
4085
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4086
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.390us 977.47% 151.390us 151.390us 1
4089
- torch_eager 22.03% 109.975us 99.02% 494.363us 494.363us 0.000us 0.00% 18.176us 18.176us 1
4090
- aten::silu 8.41% 41.971us 61.88% 308.937us 102.979us 7.936us 51.24% 10.624us 3.541us 3
4091
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.24% 7.936us 2.645us 3
4092
- aten::mul 5.23% 26.101us 8.92% 44.531us 14.844us 7.552us 48.76% 7.552us 2.517us 3
4093
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.76% 7.552us 2.517us 3
4094
- Activity Buffer Request 22.19% 110.773us 22.19% 110.773us 110.773us 2.688us 17.36% 2.688us 2.688us 1
4095
- aten::slice 5.05% 25.220us 6.19% 30.920us 5.153us 0.000us 0.00% 0.000us 0.000us 6
4096
- aten::as_strided 1.14% 5.700us 1.14% 5.700us 0.950us 0.000us 0.00% 0.000us 0.000us 6
4097
- cudaLaunchKernel 34.98% 174.623us 34.98% 174.623us 29.104us 0.000us 0.00% 0.000us 0.000us 6
4098
- cudaDeviceSynchronize 0.98% 4.900us 0.98% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1
4099
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4100
- Self CPU time total: 499.263us
4101
- Self CUDA time total: 15.488us
4102
 
4103
 
4104
 
@@ -4108,20 +4116,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4110
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4111
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 163.583us 1143.70% 163.583us 163.583us 1
4112
- torch_eager 6.28% 116.052us 99.70% 1.841ms 1.841ms 0.000us 0.00% 16.767us 16.767us 1
4113
- aten::silu 2.27% 41.942us 89.09% 1.645ms 548.450us 7.327us 51.23% 9.791us 3.264us 3
4114
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.327us 51.23% 7.327us 2.442us 3
4115
- aten::mul 1.55% 28.681us 2.62% 48.392us 16.131us 6.976us 48.77% 6.976us 2.325us 3
4116
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.976us 48.77% 6.976us 2.325us 3
4117
- Activity Buffer Request 78.22% 1.445ms 78.22% 1.445ms 1.445ms 2.464us 17.23% 2.464us 2.464us 1
4118
- aten::slice 1.38% 25.430us 1.70% 31.392us 5.232us 0.000us 0.00% 0.000us 0.000us 6
4119
- aten::as_strided 0.32% 5.962us 0.32% 5.962us 0.994us 0.000us 0.00% 0.000us 0.000us 6
4120
- cudaLaunchKernel 9.67% 178.614us 9.67% 178.614us 29.769us 0.000us 0.00% 0.000us 0.000us 6
4121
- cudaDeviceSynchronize 0.30% 5.570us 0.30% 5.570us 5.570us 0.000us 0.00% 0.000us 0.000us 1
4122
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4123
- Self CPU time total: 1.847ms
4124
- Self CUDA time total: 14.303us
4125
 
4126
 
4127
 
@@ -4131,20 +4139,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
4131
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4132
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4133
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4134
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 150.172us 969.60% 150.172us 150.172us 1
4135
- torch_eager 23.07% 110.204us 98.98% 472.752us 472.752us 0.000us 0.00% 18.176us 18.176us 1
4136
- aten::silu 9.08% 43.371us 60.20% 287.547us 95.849us 7.936us 51.24% 10.624us 3.541us 3
4137
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.24% 7.936us 2.645us 3
4138
- aten::mul 5.48% 26.181us 9.38% 44.801us 14.934us 7.552us 48.76% 7.552us 2.517us 3
4139
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.76% 7.552us 2.517us 3
4140
- Activity Buffer Request 19.26% 92.002us 19.26% 92.002us 92.002us 2.688us 17.36% 2.688us 2.688us 1
4141
- aten::slice 5.00% 23.870us 6.32% 30.200us 5.033us 0.000us 0.00% 0.000us 0.000us 6
4142
- aten::as_strided 1.33% 6.330us 1.33% 6.330us 1.055us 0.000us 0.00% 0.000us 0.000us 6
4143
- cudaLaunchKernel 35.76% 170.794us 35.76% 170.794us 28.466us 0.000us 0.00% 0.000us 0.000us 6
4144
- cudaDeviceSynchronize 1.02% 4.871us 1.02% 4.871us 4.871us 0.000us 0.00% 0.000us 0.000us 1
4145
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4146
- Self CPU time total: 477.623us
4147
- Self CUDA time total: 15.488us
4148
 
4149
 
4150
 
@@ -4154,20 +4162,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
4154
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4155
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4156
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4157
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 160.000us 713.30% 160.000us 160.000us 1
4158
- torch_eager 5.99% 109.975us 99.73% 1.831ms 1.831ms 0.000us 0.00% 26.335us 26.335us 1
4159
- aten::silu 2.30% 42.230us 89.52% 1.643ms 547.763us 11.583us 51.64% 15.487us 5.162us 3
4160
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.583us 51.64% 11.583us 3.861us 3
4161
- aten::mul 1.54% 28.250us 2.52% 46.180us 15.393us 10.848us 48.36% 10.848us 3.616us 3
4162
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.848us 48.36% 10.848us 3.616us 3
4163
- Activity Buffer Request 78.83% 1.447ms 78.83% 1.447ms 1.447ms 3.904us 17.40% 3.904us 3.904us 1
4164
- aten::slice 1.37% 25.211us 1.70% 31.261us 5.210us 0.000us 0.00% 0.000us 0.000us 6
4165
- aten::as_strided 0.33% 6.050us 0.33% 6.050us 1.008us 0.000us 0.00% 0.000us 0.000us 6
4166
- cudaLaunchKernel 9.37% 171.964us 9.37% 171.964us 28.661us 0.000us 0.00% 0.000us 0.000us 6
4167
- cudaDeviceSynchronize 0.27% 4.930us 0.27% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
4168
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4169
- Self CPU time total: 1.836ms
4170
- Self CUDA time total: 22.431us
4171
 
4172
 
4173
  impl wl p50(ms) ok
@@ -4181,12 +4189,6 @@ torch_eager cuda_T512_D1024 0.05 True
4181
  torch_eager cuda_T512_D2048 0.05 True
4182
  torch_eager cuda_T512_D768 0.05 True
4183
  </pre></div>
4184
- <div class="uv-install-logs" id="uv-logs-benchmark">
4185
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4186
- <div class="uv-logs-content" style="display: none;">
4187
- Installed 37 packages in 230ms
4188
- </div>
4189
- </div>
4190
  <div class="cell-artifacts">
4191
  <h4>Artifacts:</h4>
4192
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
 
3879
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3880
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3881
  </span> |
3882
+ Cell: nv | 0.21s
3883
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3884
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3885
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3895
  </div>
3896
  </div>
3897
  <div id="output-nv" class="cell-output">
3898
+ <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 15:50:40 2025
3899
  +-----------------------------------------------------------------------------------------+
3900
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3901
  |-----------------------------------------+------------------------+----------------------+
 
3904
  | | | MIG M. |
3905
  |=========================================+========================+======================|
3906
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3907
+ | N/A 28C P0 78W / 350W | 0MiB / 46068MiB | 11% Default |
3908
  | | | N/A |
3909
  +-----------------------------------------+------------------------+----------------------+
3910
 
 
3926
  <span class="collapse-indicators">
3927
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3928
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3929
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3930
  </span> |
3931
+ Cell: benchmark | 3.39s
3932
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3933
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3934
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3978
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3979
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3980
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3981
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 198.560us 1555.14% 198.560us 198.560us 1
3982
+ torch_eager 10.82% 202.394us 99.60% 1.864ms 1.864ms 0.000us 0.00% 15.104us 15.104us 1
3983
+ aten::silu 3.05% 57.001us 82.79% 1.549ms 516.356us 6.560us 51.38% 8.896us 2.965us 3
3984
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.38% 6.560us 2.187us 3
3985
+ aten::mul 1.85% 34.663us 3.11% 58.253us 19.418us 6.208us 48.62% 6.208us 2.069us 3
3986
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 48.62% 6.208us 2.069us 3
3987
+ Activity Buffer Request 77.33% 1.447ms 77.33% 1.447ms 1.447ms 2.336us 18.30% 2.336us 2.336us 1
3988
+ aten::slice 2.27% 42.481us 2.88% 53.841us 8.973us 0.000us 0.00% 0.000us 0.000us 6
3989
+ aten::as_strided 0.61% 11.360us 0.61% 11.360us 1.893us 0.000us 0.00% 0.000us 0.000us 6
3990
+ cudaLaunchKernel 3.67% 68.681us 3.67% 68.681us 11.447us 0.000us 0.00% 0.000us 0.000us 6
3991
+ cudaDeviceSynchronize 0.40% 7.560us 0.40% 7.560us 7.560us 0.000us 0.00% 0.000us 0.000us 1
3992
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3993
+ Self CPU time total: 1.871ms
3994
  Self CUDA time total: 12.768us
3995
 
3996
 
 
4001
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4002
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.854us 1245.68% 153.854us 153.854us 1
4005
+ torch_eager 7.83% 135.935us 99.65% 1.729ms 1.729ms 0.000us 0.00% 14.495us 14.495us 1
4006
+ aten::silu 2.47% 42.821us 87.44% 1.517ms 505.699us 6.399us 51.81% 8.543us 2.848us 3
4007
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.81% 6.399us 2.133us 3
4008
+ aten::mul 1.58% 27.360us 2.69% 46.680us 15.560us 5.952us 48.19% 5.952us 1.984us 3
4009
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.952us 48.19% 5.952us 1.984us 3
4010
+ Activity Buffer Request 83.34% 1.446ms 83.34% 1.446ms 1.446ms 2.144us 17.36% 2.144us 2.144us 1
4011
+ aten::slice 1.38% 23.991us 1.69% 29.361us 4.893us 0.000us 0.00% 0.000us 0.000us 6
4012
+ aten::as_strided 0.31% 5.370us 0.31% 5.370us 0.895us 0.000us 0.00% 0.000us 0.000us 6
4013
+ cudaLaunchKernel 2.74% 47.550us 2.74% 47.550us 7.925us 0.000us 0.00% 0.000us 0.000us 6
4014
+ cudaDeviceSynchronize 0.35% 6.041us 0.35% 6.041us 6.041us 0.000us 0.00% 0.000us 0.000us 1
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
+ Self CPU time total: 1.735ms
4017
+ Self CUDA time total: 12.351us
4018
 
4019
 
4020
 
 
4024
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4026
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.990us 1157.70% 152.990us 152.990us 1
4028
+ torch_eager 7.93% 136.944us 99.69% 1.722ms 1.722ms 0.000us 0.00% 15.487us 15.487us 1
4029
+ aten::silu 2.43% 41.922us 87.32% 1.508ms 502.829us 6.752us 51.09% 9.024us 3.008us 3
4030
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 51.09% 6.752us 2.251us 3
4031
+ aten::mul 1.55% 26.841us 2.71% 46.791us 15.597us 6.463us 48.91% 6.463us 2.154us 3
4032
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.463us 48.91% 6.463us 2.154us 3
4033
+ Activity Buffer Request 83.33% 1.439ms 83.33% 1.439ms 1.439ms 2.272us 17.19% 2.272us 2.272us 1
4034
+ aten::slice 1.41% 24.420us 1.74% 29.990us 4.998us 0.000us 0.00% 0.000us 0.000us 6
4035
+ aten::as_strided 0.32% 5.570us 0.32% 5.570us 0.928us 0.000us 0.00% 0.000us 0.000us 6
4036
+ cudaLaunchKernel 2.72% 47.030us 2.72% 47.030us 7.838us 0.000us 0.00% 0.000us 0.000us 6
4037
+ cudaDeviceSynchronize 0.31% 5.290us 0.31% 5.290us 5.290us 0.000us 0.00% 0.000us 0.000us 1
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
+ Self CPU time total: 1.728ms
4040
+ Self CUDA time total: 13.215us
4041
 
4042
 
4043
 
 
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4049
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4050
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.287us 1195.72% 152.287us 152.287us 1
4051
+ torch_eager 6.75% 128.682us 99.76% 1.902ms 1.902ms 0.000us 0.00% 14.944us 14.944us 1
4052
+ aten::silu 2.22% 42.301us 89.12% 1.699ms 566.261us 6.560us 51.51% 8.768us 2.923us 3
4053
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.51% 6.560us 2.187us 3
4054
+ aten::mul 1.34% 25.502us 2.28% 43.392us 14.464us 6.176us 48.49% 6.176us 2.059us 3
4055
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
4056
+ Activity Buffer Request 74.83% 1.427ms 74.83% 1.427ms 1.427ms 2.208us 17.34% 2.208us 2.208us 1
4057
+ aten::slice 1.32% 25.141us 1.61% 30.781us 5.130us 0.000us 0.00% 0.000us 0.000us 6
4058
+ aten::as_strided 0.30% 5.640us 0.30% 5.640us 0.940us 0.000us 0.00% 0.000us 0.000us 6
4059
+ cudaLaunchKernel 13.00% 247.856us 13.00% 247.856us 41.309us 0.000us 0.00% 0.000us 0.000us 6
4060
+ cudaDeviceSynchronize 0.24% 4.611us 0.24% 4.611us 4.611us 0.000us 0.00% 0.000us 0.000us 1
4061
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4062
+ Self CPU time total: 1.906ms
4063
+ Self CUDA time total: 12.736us
4064
 
4065
 
4066
 
 
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4072
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4073
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.054us 1155.39% 153.054us 153.054us 1
4074
+ torch_eager 6.42% 122.793us 99.75% 1.907ms 1.907ms 0.000us 0.00% 15.518us 15.518us 1
4075
+ aten::silu 2.19% 41.952us 89.33% 1.708ms 569.191us 6.751us 50.96% 9.022us 3.007us 3
4076
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.751us 50.96% 6.751us 2.250us 3
4077
+ aten::mul 1.27% 24.330us 2.36% 45.101us 15.034us 6.496us 49.04% 6.496us 2.165us 3
4078
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.496us 49.04% 6.496us 2.165us 3
4079
+ Activity Buffer Request 76.06% 1.454ms 76.06% 1.454ms 1.454ms 2.271us 17.14% 2.271us 2.271us 1
4080
+ aten::slice 1.34% 25.570us 1.64% 31.330us 5.222us 0.000us 0.00% 0.000us 0.000us 6
4081
+ aten::as_strided 0.30% 5.760us 0.30% 5.760us 0.960us 0.000us 0.00% 0.000us 0.000us 6
4082
+ cudaLaunchKernel 12.16% 232.387us 12.16% 232.387us 38.731us 0.000us 0.00% 0.000us 0.000us 6
4083
+ cudaDeviceSynchronize 0.25% 4.840us 0.25% 4.840us 4.840us 0.000us 0.00% 0.000us 0.000us 1
4084
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4085
+ Self CPU time total: 1.912ms
4086
+ Self CUDA time total: 13.247us
4087
 
4088
 
4089
 
 
4093
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4094
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4095
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4096
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.743us 1029.27% 159.743us 159.743us 1
4097
+ torch_eager 7.04% 135.613us 99.74% 1.921ms 1.921ms 0.000us 0.00% 18.208us 18.208us 1
4098
+ aten::silu 2.22% 42.702us 88.66% 1.708ms 569.181us 7.936us 51.13% 10.624us 3.541us 3
4099
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.13% 7.936us 2.645us 3
4100
+ aten::mul 1.46% 28.181us 2.39% 45.941us 15.314us 7.584us 48.87% 7.584us 2.528us 3
4101
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.87% 7.584us 2.528us 3
4102
+ Activity Buffer Request 75.65% 1.457ms 75.65% 1.457ms 1.457ms 2.688us 17.32% 2.688us 2.688us 1
4103
+ aten::slice 1.35% 26.081us 1.66% 31.951us 5.325us 0.000us 0.00% 0.000us 0.000us 6
4104
+ aten::as_strided 0.30% 5.870us 0.30% 5.870us 0.978us 0.000us 0.00% 0.000us 0.000us 6
4105
+ cudaLaunchKernel 11.71% 225.495us 11.71% 225.495us 37.582us 0.000us 0.00% 0.000us 0.000us 6
4106
+ cudaDeviceSynchronize 0.26% 4.960us 0.26% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1
4107
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4108
+ Self CPU time total: 1.926ms
4109
+ Self CUDA time total: 15.520us
4110
 
4111
 
4112
 
 
4116
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4117
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 156.031us 1088.46% 156.031us 156.031us 1
4120
+ torch_eager 6.78% 127.672us 99.74% 1.878ms 1.878ms 0.000us 0.00% 16.798us 16.798us 1
4121
+ aten::silu 2.24% 42.252us 88.75% 1.671ms 556.944us 7.327us 51.11% 9.790us 3.263us 3
4122
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.327us 51.11% 7.327us 2.442us 3
4123
+ aten::mul 1.40% 26.401us 2.46% 46.222us 15.407us 7.008us 48.89% 7.008us 2.336us 3
4124
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.89% 7.008us 2.336us 3
4125
+ Activity Buffer Request 75.83% 1.428ms 75.83% 1.428ms 1.428ms 2.463us 17.18% 2.463us 2.463us 1
4126
+ aten::slice 1.43% 26.941us 1.75% 32.941us 5.490us 0.000us 0.00% 0.000us 0.000us 6
4127
+ aten::as_strided 0.32% 6.000us 0.32% 6.000us 1.000us 0.000us 0.00% 0.000us 0.000us 6
4128
+ cudaLaunchKernel 11.73% 220.885us 11.73% 220.885us 36.814us 0.000us 0.00% 0.000us 0.000us 6
4129
+ cudaDeviceSynchronize 0.26% 4.871us 0.26% 4.871us 4.871us 0.000us 0.00% 0.000us 0.000us 1
4130
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4131
+ Self CPU time total: 1.883ms
4132
+ Self CUDA time total: 14.335us
4133
 
4134
 
4135
 
 
4139
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4140
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4141
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4142
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.072us 971.40% 151.072us 151.072us 1
4143
+ torch_eager 5.82% 108.433us 99.72% 1.859ms 1.859ms 0.000us 0.00% 18.240us 18.240us 1
4144
+ aten::silu 2.20% 40.971us 89.83% 1.675ms 558.344us 7.968us 51.23% 10.656us 3.552us 3
4145
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 51.23% 7.968us 2.656us 3
4146
+ aten::mul 1.42% 26.501us 2.46% 45.902us 15.301us 7.584us 48.77% 7.584us 2.528us 3
4147
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.77% 7.584us 2.528us 3
4148
+ Activity Buffer Request 76.88% 1.433ms 76.88% 1.433ms 1.433ms 2.688us 17.28% 2.688us 2.688us 1
4149
+ aten::slice 1.31% 24.441us 1.61% 29.960us 4.993us 0.000us 0.00% 0.000us 0.000us 6
4150
+ aten::as_strided 0.30% 5.519us 0.30% 5.519us 0.920us 0.000us 0.00% 0.000us 0.000us 6
4151
+ cudaLaunchKernel 11.80% 219.996us 11.80% 219.996us 36.666us 0.000us 0.00% 0.000us 0.000us 6
4152
+ cudaDeviceSynchronize 0.28% 5.300us 0.28% 5.300us 5.300us 0.000us 0.00% 0.000us 0.000us 1
4153
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4154
+ Self CPU time total: 1.865ms
4155
+ Self CUDA time total: 15.552us
4156
 
4157
 
4158
 
 
4162
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4163
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4164
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4165
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 157.150us 692.69% 157.150us 157.150us 1
4166
+ torch_eager 5.73% 107.203us 99.74% 1.865ms 1.865ms 0.000us 0.00% 26.622us 26.622us 1
4167
+ aten::silu 2.21% 41.231us 89.87% 1.680ms 560.117us 11.647us 51.34% 15.582us 5.194us 3
4168
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.647us 51.34% 11.647us 3.882us 3
4169
+ aten::mul 1.38% 25.882us 2.47% 46.192us 15.397us 11.040us 48.66% 11.040us 3.680us 3
4170
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.040us 48.66% 11.040us 3.680us 3
4171
+ Activity Buffer Request 77.17% 1.443ms 77.17% 1.443ms 1.443ms 3.935us 17.34% 3.935us 3.935us 1
4172
+ aten::slice 1.37% 25.600us 1.67% 31.160us 5.193us 0.000us 0.00% 0.000us 0.000us 6
4173
+ aten::as_strided 0.30% 5.560us 0.30% 5.560us 0.927us 0.000us 0.00% 0.000us 0.000us 6
4174
+ cudaLaunchKernel 11.58% 216.535us 11.58% 216.535us 36.089us 0.000us 0.00% 0.000us 0.000us 6
4175
+ cudaDeviceSynchronize 0.26% 4.830us 0.26% 4.830us 4.830us 0.000us 0.00% 0.000us 0.000us 1
4176
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4177
+ Self CPU time total: 1.870ms
4178
+ Self CUDA time total: 22.687us
4179
 
4180
 
4181
  impl wl p50(ms) ok
 
4189
  torch_eager cuda_T512_D2048 0.05 True
4190
  torch_eager cuda_T512_D768 0.05 True
4191
  </pre></div>
 
 
 
 
 
 
4192
  <div class="cell-artifacts">
4193
  <h4>Artifacts:</h4>
4194
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
activation/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: f62c7d85fc4a76cf7a1060a62df99ff0d32133ab94bb502b68dcd53171c39602
  • Pointer size: 130 Bytes
  • Size of remote file: 21.4 kB

Git LFS Details

  • SHA256: 29b9e8bb5a372481457939e6eee0f747e53209886137e7247a5b8d98423c5492
  • Pointer size: 130 Bytes
  • Size of remote file: 20.6 kB
activation/results/combined_results.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
@@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
- <dc:date>2025-10-29T14:27:49.999657</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
@@ -4021,96 +4029,83 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4021
  <g id="matplotlib.axis_2">
4022
  <g id="ytick_1">
4023
  <g id="grid-y--2" class="grid grid-y">
4024
- <path d="M 60.23 428.188156 L 847.294169 428.188156 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4025
  </g>
4026
  <g id="line2d_10">
4027
  <defs>
4028
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4029
  </defs>
4030
  <g>
4031
- <use ns4:href="#m0fca2865ba" x="60.23" y="428.188156" style="stroke: #000000; stroke-width: 0.8" />
4032
  </g>
4033
  </g>
4034
  <g id="text_10">
4035
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="431.987375" transform="rotate(-0 53.23 431.987375)">0.025</text>
4036
  </g>
4037
  </g>
4038
  <g id="ytick_2">
4039
  <g id="grid-y--3" class="grid grid-y">
4040
- <path d="M 60.23 362.86799 L 847.294169 362.86799 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4041
  </g>
4042
  <g id="line2d_11">
4043
  <g>
4044
- <use ns4:href="#m0fca2865ba" x="60.23" y="362.86799" style="stroke: #000000; stroke-width: 0.8" />
4045
  </g>
4046
  </g>
4047
  <g id="text_11">
4048
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="366.667209" transform="rotate(-0 53.23 366.667209)">0.030</text>
4049
  </g>
4050
  </g>
4051
  <g id="ytick_3">
4052
  <g id="grid-y--4" class="grid grid-y">
4053
- <path d="M 60.23 297.547824 L 847.294169 297.547824 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4054
  </g>
4055
  <g id="line2d_12">
4056
  <g>
4057
- <use ns4:href="#m0fca2865ba" x="60.23" y="297.547824" style="stroke: #000000; stroke-width: 0.8" />
4058
  </g>
4059
  </g>
4060
  <g id="text_12">
4061
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="301.347043" transform="rotate(-0 53.23 301.347043)">0.035</text>
4062
  </g>
4063
  </g>
4064
  <g id="ytick_4">
4065
  <g id="grid-y--5" class="grid grid-y">
4066
- <path d="M 60.23 232.227658 L 847.294169 232.227658 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4067
  </g>
4068
  <g id="line2d_13">
4069
  <g>
4070
- <use ns4:href="#m0fca2865ba" x="60.23" y="232.227658" style="stroke: #000000; stroke-width: 0.8" />
4071
  </g>
4072
  </g>
4073
  <g id="text_13">
4074
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="236.026877" transform="rotate(-0 53.23 236.026877)">0.040</text>
4075
  </g>
4076
  </g>
4077
  <g id="ytick_5">
4078
  <g id="grid-y--6" class="grid grid-y">
4079
- <path d="M 60.23 166.907492 L 847.294169 166.907492 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4080
  </g>
4081
  <g id="line2d_14">
4082
  <g>
4083
- <use ns4:href="#m0fca2865ba" x="60.23" y="166.907492" style="stroke: #000000; stroke-width: 0.8" />
4084
  </g>
4085
  </g>
4086
  <g id="text_14">
4087
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="170.706711" transform="rotate(-0 53.23 170.706711)">0.045</text>
4088
  </g>
4089
  </g>
4090
  <g id="ytick_6">
4091
  <g id="grid-y--7" class="grid grid-y">
4092
- <path d="M 60.23 101.587327 L 847.294169 101.587327 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4093
  </g>
4094
  <g id="line2d_15">
4095
  <g>
4096
- <use ns4:href="#m0fca2865ba" x="60.23" y="101.587327" style="stroke: #000000; stroke-width: 0.8" />
4097
  </g>
4098
  </g>
4099
  <g id="text_15">
4100
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="105.386545" transform="rotate(-0 53.23 105.386545)">0.050</text>
4101
- </g>
4102
- </g>
4103
- <g id="ytick_7">
4104
- <g id="grid-y--8" class="grid grid-y">
4105
- <path d="M 60.23 36.267161 L 847.294169 36.267161 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4106
- </g>
4107
- <g id="line2d_16">
4108
- <g>
4109
- <use ns4:href="#m0fca2865ba" x="60.23" y="36.267161" style="stroke: #000000; stroke-width: 0.8" />
4110
- </g>
4111
- </g>
4112
- <g id="text_16">
4113
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="40.066379" transform="rotate(-0 53.23 40.066379)">0.055</text>
4114
  </g>
4115
  </g>
4116
  <g id="label--y" class="ylabel">
@@ -4118,37 +4113,37 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4118
  </g>
4119
  </g>
4120
  <g id="series--hf-kernels-swiglu" class="series">
4121
- <path d="M 96.005644 451.16779 L 185.444754 385.847624 L 274.883864 395.253728 L 364.322974 398.911657 L 453.762084 382.189695 L 543.201194 401.393823 L 632.640304 395.136152 L 722.079415 381.275213 L 811.518525 395.515009 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4122
  <defs>
4123
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4124
  </defs>
4125
  <g clip-path="url(#p620c7d392f)">
4126
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4127
- <use ns4:href="#md7efaf3aec" x="185.444754" y="385.847624" style="fill: #1f77b4; stroke: #1f77b4" />
4128
- <use ns4:href="#md7efaf3aec" x="274.883864" y="395.253728" style="fill: #1f77b4; stroke: #1f77b4" />
4129
- <use ns4:href="#md7efaf3aec" x="364.322974" y="398.911657" style="fill: #1f77b4; stroke: #1f77b4" />
4130
- <use ns4:href="#md7efaf3aec" x="453.762084" y="382.189695" style="fill: #1f77b4; stroke: #1f77b4" />
4131
- <use ns4:href="#md7efaf3aec" x="543.201194" y="401.393823" style="fill: #1f77b4; stroke: #1f77b4" />
4132
- <use ns4:href="#md7efaf3aec" x="632.640304" y="395.136152" style="fill: #1f77b4; stroke: #1f77b4" />
4133
- <use ns4:href="#md7efaf3aec" x="722.079415" y="381.275213" style="fill: #1f77b4; stroke: #1f77b4" />
4134
- <use ns4:href="#md7efaf3aec" x="811.518525" y="395.515009" style="fill: #1f77b4; stroke: #1f77b4" />
4135
  </g>
4136
  </g>
4137
  <g id="series--torch-eager" class="series">
4138
- <path d="M 96.005644 194.328898 L 185.444754 47.08418 L 274.883864 59.495011 L 364.322974 61.46768 L 453.762084 66.170732 L 543.201194 84.055394 L 632.640304 56.503348 L 722.079415 80.67181 L 811.518525 81.586292 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4139
  <defs>
4140
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4141
  </defs>
4142
  <g clip-path="url(#p620c7d392f)">
4143
- <use ns4:href="#m9b8c54d372" x="96.005644" y="194.328898" style="fill: #ff7f0e; stroke: #ff7f0e" />
4144
  <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4145
- <use ns4:href="#m9b8c54d372" x="274.883864" y="59.495011" style="fill: #ff7f0e; stroke: #ff7f0e" />
4146
- <use ns4:href="#m9b8c54d372" x="364.322974" y="61.46768" style="fill: #ff7f0e; stroke: #ff7f0e" />
4147
- <use ns4:href="#m9b8c54d372" x="453.762084" y="66.170732" style="fill: #ff7f0e; stroke: #ff7f0e" />
4148
- <use ns4:href="#m9b8c54d372" x="543.201194" y="84.055394" style="fill: #ff7f0e; stroke: #ff7f0e" />
4149
- <use ns4:href="#m9b8c54d372" x="632.640304" y="56.503348" style="fill: #ff7f0e; stroke: #ff7f0e" />
4150
- <use ns4:href="#m9b8c54d372" x="722.079415" y="80.67181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4151
- <use ns4:href="#m9b8c54d372" x="811.518525" y="81.586292" style="fill: #ff7f0e; stroke: #ff7f0e" />
4152
  </g>
4153
  </g>
4154
  <g id="patch_3">
@@ -4163,14 +4158,14 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4163
  <g id="patch_6">
4164
  <path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4165
  </g>
4166
- <g id="text_17">
4167
  <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
4168
  </g>
4169
  <g id="legend" class="legend">
4170
  <g id="patch_7">
4171
  <path d="M 720.811356 64.7925 L 840.294169 64.7925 Q 842.294169 64.7925 842.294169 62.7925 L 842.294169 33.88 Q 842.294169 31.88 840.294169 31.88 L 720.811356 31.88 Q 718.811356 31.88 718.811356 33.88 L 718.811356 62.7925 Q 718.811356 64.7925 720.811356 64.7925 L 720.811356 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4172
  </g>
4173
- <g id="line2d_17">
4174
  <path d="M 722.811356 39.978438 L 732.811356 39.978438 L 742.811356 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4175
  <g>
4176
  <use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
@@ -4179,7 +4174,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4179
  <g id="legend-label--hf-kernels-swiglu" class="legend">
4180
  <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
4181
  </g>
4182
- <g id="line2d_18">
4183
  <path d="M 722.811356 54.934687 L 732.811356 54.934687 L 742.811356 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4184
  <g>
4185
  <use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
@@ -4206,7 +4201,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4206
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4207
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4208
  </span> |
4209
- Cell: combine | 4.24s
4210
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4211
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4212
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4345,7 +4340,7 @@ Installed 37 packages in 218ms
4345
  <rdf:RDF>
4346
  <ns2:Work>
4347
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4348
- <dc:date>2025-10-29T14:27:49.999657</dc:date>
4349
  <dc:format>image/svg+xml</dc:format>
4350
  <dc:creator>
4351
  <ns2:Agent>
@@ -4494,96 +4489,83 @@ Installed 37 packages in 218ms
4494
  <g id="matplotlib.axis_2">
4495
  <g id="ytick_1">
4496
  <g id="grid-y--2" class="grid grid-y">
4497
- <path d="M 60.23 428.188156 L 847.294169 428.188156 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4498
  </g>
4499
  <g id="line2d_10">
4500
  <defs>
4501
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4502
  </defs>
4503
  <g>
4504
- <use ns4:href="#m0fca2865ba" x="60.23" y="428.188156" style="stroke: #000000; stroke-width: 0.8" />
4505
  </g>
4506
  </g>
4507
  <g id="text_10">
4508
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="431.987375" transform="rotate(-0 53.23 431.987375)">0.025</text>
4509
  </g>
4510
  </g>
4511
  <g id="ytick_2">
4512
  <g id="grid-y--3" class="grid grid-y">
4513
- <path d="M 60.23 362.86799 L 847.294169 362.86799 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4514
  </g>
4515
  <g id="line2d_11">
4516
  <g>
4517
- <use ns4:href="#m0fca2865ba" x="60.23" y="362.86799" style="stroke: #000000; stroke-width: 0.8" />
4518
  </g>
4519
  </g>
4520
  <g id="text_11">
4521
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="366.667209" transform="rotate(-0 53.23 366.667209)">0.030</text>
4522
  </g>
4523
  </g>
4524
  <g id="ytick_3">
4525
  <g id="grid-y--4" class="grid grid-y">
4526
- <path d="M 60.23 297.547824 L 847.294169 297.547824 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4527
  </g>
4528
  <g id="line2d_12">
4529
  <g>
4530
- <use ns4:href="#m0fca2865ba" x="60.23" y="297.547824" style="stroke: #000000; stroke-width: 0.8" />
4531
  </g>
4532
  </g>
4533
  <g id="text_12">
4534
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="301.347043" transform="rotate(-0 53.23 301.347043)">0.035</text>
4535
  </g>
4536
  </g>
4537
  <g id="ytick_4">
4538
  <g id="grid-y--5" class="grid grid-y">
4539
- <path d="M 60.23 232.227658 L 847.294169 232.227658 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4540
  </g>
4541
  <g id="line2d_13">
4542
  <g>
4543
- <use ns4:href="#m0fca2865ba" x="60.23" y="232.227658" style="stroke: #000000; stroke-width: 0.8" />
4544
  </g>
4545
  </g>
4546
  <g id="text_13">
4547
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="236.026877" transform="rotate(-0 53.23 236.026877)">0.040</text>
4548
  </g>
4549
  </g>
4550
  <g id="ytick_5">
4551
  <g id="grid-y--6" class="grid grid-y">
4552
- <path d="M 60.23 166.907492 L 847.294169 166.907492 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4553
  </g>
4554
  <g id="line2d_14">
4555
  <g>
4556
- <use ns4:href="#m0fca2865ba" x="60.23" y="166.907492" style="stroke: #000000; stroke-width: 0.8" />
4557
  </g>
4558
  </g>
4559
  <g id="text_14">
4560
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="170.706711" transform="rotate(-0 53.23 170.706711)">0.045</text>
4561
  </g>
4562
  </g>
4563
  <g id="ytick_6">
4564
  <g id="grid-y--7" class="grid grid-y">
4565
- <path d="M 60.23 101.587327 L 847.294169 101.587327 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4566
  </g>
4567
  <g id="line2d_15">
4568
  <g>
4569
- <use ns4:href="#m0fca2865ba" x="60.23" y="101.587327" style="stroke: #000000; stroke-width: 0.8" />
4570
  </g>
4571
  </g>
4572
  <g id="text_15">
4573
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="105.386545" transform="rotate(-0 53.23 105.386545)">0.050</text>
4574
- </g>
4575
- </g>
4576
- <g id="ytick_7">
4577
- <g id="grid-y--8" class="grid grid-y">
4578
- <path d="M 60.23 36.267161 L 847.294169 36.267161 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4579
- </g>
4580
- <g id="line2d_16">
4581
- <g>
4582
- <use ns4:href="#m0fca2865ba" x="60.23" y="36.267161" style="stroke: #000000; stroke-width: 0.8" />
4583
- </g>
4584
- </g>
4585
- <g id="text_16">
4586
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="40.066379" transform="rotate(-0 53.23 40.066379)">0.055</text>
4587
  </g>
4588
  </g>
4589
  <g id="label--y" class="ylabel">
@@ -4591,37 +4573,37 @@ Installed 37 packages in 218ms
4591
  </g>
4592
  </g>
4593
  <g id="series--hf-kernels-swiglu" class="series">
4594
- <path d="M 96.005644 451.16779 L 185.444754 385.847624 L 274.883864 395.253728 L 364.322974 398.911657 L 453.762084 382.189695 L 543.201194 401.393823 L 632.640304 395.136152 L 722.079415 381.275213 L 811.518525 395.515009 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4595
  <defs>
4596
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4597
  </defs>
4598
  <g clip-path="url(#p620c7d392f)">
4599
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4600
- <use ns4:href="#md7efaf3aec" x="185.444754" y="385.847624" style="fill: #1f77b4; stroke: #1f77b4" />
4601
- <use ns4:href="#md7efaf3aec" x="274.883864" y="395.253728" style="fill: #1f77b4; stroke: #1f77b4" />
4602
- <use ns4:href="#md7efaf3aec" x="364.322974" y="398.911657" style="fill: #1f77b4; stroke: #1f77b4" />
4603
- <use ns4:href="#md7efaf3aec" x="453.762084" y="382.189695" style="fill: #1f77b4; stroke: #1f77b4" />
4604
- <use ns4:href="#md7efaf3aec" x="543.201194" y="401.393823" style="fill: #1f77b4; stroke: #1f77b4" />
4605
- <use ns4:href="#md7efaf3aec" x="632.640304" y="395.136152" style="fill: #1f77b4; stroke: #1f77b4" />
4606
- <use ns4:href="#md7efaf3aec" x="722.079415" y="381.275213" style="fill: #1f77b4; stroke: #1f77b4" />
4607
- <use ns4:href="#md7efaf3aec" x="811.518525" y="395.515009" style="fill: #1f77b4; stroke: #1f77b4" />
4608
  </g>
4609
  </g>
4610
  <g id="series--torch-eager" class="series">
4611
- <path d="M 96.005644 194.328898 L 185.444754 47.08418 L 274.883864 59.495011 L 364.322974 61.46768 L 453.762084 66.170732 L 543.201194 84.055394 L 632.640304 56.503348 L 722.079415 80.67181 L 811.518525 81.586292 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4612
  <defs>
4613
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4614
  </defs>
4615
  <g clip-path="url(#p620c7d392f)">
4616
- <use ns4:href="#m9b8c54d372" x="96.005644" y="194.328898" style="fill: #ff7f0e; stroke: #ff7f0e" />
4617
  <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4618
- <use ns4:href="#m9b8c54d372" x="274.883864" y="59.495011" style="fill: #ff7f0e; stroke: #ff7f0e" />
4619
- <use ns4:href="#m9b8c54d372" x="364.322974" y="61.46768" style="fill: #ff7f0e; stroke: #ff7f0e" />
4620
- <use ns4:href="#m9b8c54d372" x="453.762084" y="66.170732" style="fill: #ff7f0e; stroke: #ff7f0e" />
4621
- <use ns4:href="#m9b8c54d372" x="543.201194" y="84.055394" style="fill: #ff7f0e; stroke: #ff7f0e" />
4622
- <use ns4:href="#m9b8c54d372" x="632.640304" y="56.503348" style="fill: #ff7f0e; stroke: #ff7f0e" />
4623
- <use ns4:href="#m9b8c54d372" x="722.079415" y="80.67181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4624
- <use ns4:href="#m9b8c54d372" x="811.518525" y="81.586292" style="fill: #ff7f0e; stroke: #ff7f0e" />
4625
  </g>
4626
  </g>
4627
  <g id="patch_3">
@@ -4636,14 +4618,14 @@ Installed 37 packages in 218ms
4636
  <g id="patch_6">
4637
  <path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4638
  </g>
4639
- <g id="text_17">
4640
  <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
4641
  </g>
4642
  <g id="legend" class="legend">
4643
  <g id="patch_7">
4644
  <path d="M 720.811356 64.7925 L 840.294169 64.7925 Q 842.294169 64.7925 842.294169 62.7925 L 842.294169 33.88 Q 842.294169 31.88 840.294169 31.88 L 720.811356 31.88 Q 718.811356 31.88 718.811356 33.88 L 718.811356 62.7925 Q 718.811356 64.7925 720.811356 64.7925 L 720.811356 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4645
  </g>
4646
- <g id="line2d_17">
4647
  <path d="M 722.811356 39.978438 L 732.811356 39.978438 L 742.811356 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4648
  <g>
4649
  <use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
@@ -4652,7 +4634,7 @@ Installed 37 packages in 218ms
4652
  <g id="legend-label--hf-kernels-swiglu" class="legend">
4653
  <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
4654
  </g>
4655
- <g id="line2d_18">
4656
  <path d="M 722.811356 54.934687 L 732.811356 54.934687 L 742.811356 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4657
  <g>
4658
  <use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
 
3880
  <rdf:RDF>
3881
  <ns2:Work>
3882
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3883
+ <dc:date>2025-10-29T15:51:13.643076</dc:date>
3884
  <dc:format>image/svg+xml</dc:format>
3885
  <dc:creator>
3886
  <ns2:Agent>
 
4029
  <g id="matplotlib.axis_2">
4030
  <g id="ytick_1">
4031
  <g id="grid-y--2" class="grid grid-y">
4032
+ <path d="M 60.23 438.443756 L 847.294169 438.443756 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4033
  </g>
4034
  <g id="line2d_10">
4035
  <defs>
4036
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4037
  </defs>
4038
  <g>
4039
+ <use ns4:href="#m0fca2865ba" x="60.23" y="438.443756" style="stroke: #000000; stroke-width: 0.8" />
4040
  </g>
4041
  </g>
4042
  <g id="text_10">
4043
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="442.242975" transform="rotate(-0 53.23 442.242975)">0.025</text>
4044
  </g>
4045
  </g>
4046
  <g id="ytick_2">
4047
  <g id="grid-y--3" class="grid grid-y">
4048
+ <path d="M 60.23 367.676049 L 847.294169 367.676049 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4049
  </g>
4050
  <g id="line2d_11">
4051
  <g>
4052
+ <use ns4:href="#m0fca2865ba" x="60.23" y="367.676049" style="stroke: #000000; stroke-width: 0.8" />
4053
  </g>
4054
  </g>
4055
  <g id="text_11">
4056
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="371.475268" transform="rotate(-0 53.23 371.475268)">0.030</text>
4057
  </g>
4058
  </g>
4059
  <g id="ytick_3">
4060
  <g id="grid-y--4" class="grid grid-y">
4061
+ <path d="M 60.23 296.908341 L 847.294169 296.908341 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4062
  </g>
4063
  <g id="line2d_12">
4064
  <g>
4065
+ <use ns4:href="#m0fca2865ba" x="60.23" y="296.908341" style="stroke: #000000; stroke-width: 0.8" />
4066
  </g>
4067
  </g>
4068
  <g id="text_12">
4069
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="300.70756" transform="rotate(-0 53.23 300.70756)">0.035</text>
4070
  </g>
4071
  </g>
4072
  <g id="ytick_4">
4073
  <g id="grid-y--5" class="grid grid-y">
4074
+ <path d="M 60.23 226.140634 L 847.294169 226.140634 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4075
  </g>
4076
  <g id="line2d_13">
4077
  <g>
4078
+ <use ns4:href="#m0fca2865ba" x="60.23" y="226.140634" style="stroke: #000000; stroke-width: 0.8" />
4079
  </g>
4080
  </g>
4081
  <g id="text_13">
4082
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="229.939852" transform="rotate(-0 53.23 229.939852)">0.040</text>
4083
  </g>
4084
  </g>
4085
  <g id="ytick_5">
4086
  <g id="grid-y--6" class="grid grid-y">
4087
+ <path d="M 60.23 155.372926 L 847.294169 155.372926 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4088
  </g>
4089
  <g id="line2d_14">
4090
  <g>
4091
+ <use ns4:href="#m0fca2865ba" x="60.23" y="155.372926" style="stroke: #000000; stroke-width: 0.8" />
4092
  </g>
4093
  </g>
4094
  <g id="text_14">
4095
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="159.172145" transform="rotate(-0 53.23 159.172145)">0.045</text>
4096
  </g>
4097
  </g>
4098
  <g id="ytick_6">
4099
  <g id="grid-y--7" class="grid grid-y">
4100
+ <path d="M 60.23 84.605219 L 847.294169 84.605219 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4101
  </g>
4102
  <g id="line2d_15">
4103
  <g>
4104
+ <use ns4:href="#m0fca2865ba" x="60.23" y="84.605219" style="stroke: #000000; stroke-width: 0.8" />
4105
  </g>
4106
  </g>
4107
  <g id="text_15">
4108
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="88.404437" transform="rotate(-0 53.23 88.404437)">0.050</text>
 
 
 
 
 
 
 
 
 
 
 
 
 
4109
  </g>
4110
  </g>
4111
  <g id="label--y" class="ylabel">
 
4113
  </g>
4114
  </g>
4115
  <g id="series--hf-kernels-swiglu" class="series">
4116
+ <path d="M 96.005644 451.16779 L 185.444754 364.40658 L 274.883864 374.045142 L 364.322974 392.869353 L 453.762084 389.882956 L 543.201194 397.667403 L 632.640304 381.532366 L 722.079415 398.106163 L 811.518525 419.478011 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4117
  <defs>
4118
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4119
  </defs>
4120
  <g clip-path="url(#p620c7d392f)">
4121
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4122
+ <use ns4:href="#md7efaf3aec" x="185.444754" y="364.40658" style="fill: #1f77b4; stroke: #1f77b4" />
4123
+ <use ns4:href="#md7efaf3aec" x="274.883864" y="374.045142" style="fill: #1f77b4; stroke: #1f77b4" />
4124
+ <use ns4:href="#md7efaf3aec" x="364.322974" y="392.869353" style="fill: #1f77b4; stroke: #1f77b4" />
4125
+ <use ns4:href="#md7efaf3aec" x="453.762084" y="389.882956" style="fill: #1f77b4; stroke: #1f77b4" />
4126
+ <use ns4:href="#md7efaf3aec" x="543.201194" y="397.667403" style="fill: #1f77b4; stroke: #1f77b4" />
4127
+ <use ns4:href="#md7efaf3aec" x="632.640304" y="381.532366" style="fill: #1f77b4; stroke: #1f77b4" />
4128
+ <use ns4:href="#md7efaf3aec" x="722.079415" y="398.106163" style="fill: #1f77b4; stroke: #1f77b4" />
4129
+ <use ns4:href="#md7efaf3aec" x="811.518525" y="419.478011" style="fill: #1f77b4; stroke: #1f77b4" />
4130
  </g>
4131
  </g>
4132
  <g id="series--torch-eager" class="series">
4133
+ <path d="M 96.005644 202.773137 L 185.444754 47.08418 L 274.883864 70.154453 L 364.322974 81.180062 L 453.762084 86.714096 L 543.201194 90.252482 L 632.640304 80.486538 L 722.079415 88.412521 L 811.518525 96.338505 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4134
  <defs>
4135
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4136
  </defs>
4137
  <g clip-path="url(#p620c7d392f)">
4138
+ <use ns4:href="#m9b8c54d372" x="96.005644" y="202.773137" style="fill: #ff7f0e; stroke: #ff7f0e" />
4139
  <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4140
+ <use ns4:href="#m9b8c54d372" x="274.883864" y="70.154453" style="fill: #ff7f0e; stroke: #ff7f0e" />
4141
+ <use ns4:href="#m9b8c54d372" x="364.322974" y="81.180062" style="fill: #ff7f0e; stroke: #ff7f0e" />
4142
+ <use ns4:href="#m9b8c54d372" x="453.762084" y="86.714096" style="fill: #ff7f0e; stroke: #ff7f0e" />
4143
+ <use ns4:href="#m9b8c54d372" x="543.201194" y="90.252482" style="fill: #ff7f0e; stroke: #ff7f0e" />
4144
+ <use ns4:href="#m9b8c54d372" x="632.640304" y="80.486538" style="fill: #ff7f0e; stroke: #ff7f0e" />
4145
+ <use ns4:href="#m9b8c54d372" x="722.079415" y="88.412521" style="fill: #ff7f0e; stroke: #ff7f0e" />
4146
+ <use ns4:href="#m9b8c54d372" x="811.518525" y="96.338505" style="fill: #ff7f0e; stroke: #ff7f0e" />
4147
  </g>
4148
  </g>
4149
  <g id="patch_3">
 
4158
  <g id="patch_6">
4159
  <path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4160
  </g>
4161
+ <g id="text_16">
4162
  <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
4163
  </g>
4164
  <g id="legend" class="legend">
4165
  <g id="patch_7">
4166
  <path d="M 720.811356 64.7925 L 840.294169 64.7925 Q 842.294169 64.7925 842.294169 62.7925 L 842.294169 33.88 Q 842.294169 31.88 840.294169 31.88 L 720.811356 31.88 Q 718.811356 31.88 718.811356 33.88 L 718.811356 62.7925 Q 718.811356 64.7925 720.811356 64.7925 L 720.811356 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4167
  </g>
4168
+ <g id="line2d_16">
4169
  <path d="M 722.811356 39.978438 L 732.811356 39.978438 L 742.811356 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4170
  <g>
4171
  <use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
 
4174
  <g id="legend-label--hf-kernels-swiglu" class="legend">
4175
  <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
4176
  </g>
4177
+ <g id="line2d_17">
4178
  <path d="M 722.811356 54.934687 L 732.811356 54.934687 L 742.811356 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4179
  <g>
4180
  <use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
 
4201
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4202
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4203
  </span> |
4204
+ Cell: combine | 4.26s
4205
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4206
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4207
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4340
  <rdf:RDF>
4341
  <ns2:Work>
4342
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4343
+ <dc:date>2025-10-29T15:51:13.643076</dc:date>
4344
  <dc:format>image/svg+xml</dc:format>
4345
  <dc:creator>
4346
  <ns2:Agent>
 
4489
  <g id="matplotlib.axis_2">
4490
  <g id="ytick_1">
4491
  <g id="grid-y--2" class="grid grid-y">
4492
+ <path d="M 60.23 438.443756 L 847.294169 438.443756 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4493
  </g>
4494
  <g id="line2d_10">
4495
  <defs>
4496
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4497
  </defs>
4498
  <g>
4499
+ <use ns4:href="#m0fca2865ba" x="60.23" y="438.443756" style="stroke: #000000; stroke-width: 0.8" />
4500
  </g>
4501
  </g>
4502
  <g id="text_10">
4503
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="442.242975" transform="rotate(-0 53.23 442.242975)">0.025</text>
4504
  </g>
4505
  </g>
4506
  <g id="ytick_2">
4507
  <g id="grid-y--3" class="grid grid-y">
4508
+ <path d="M 60.23 367.676049 L 847.294169 367.676049 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4509
  </g>
4510
  <g id="line2d_11">
4511
  <g>
4512
+ <use ns4:href="#m0fca2865ba" x="60.23" y="367.676049" style="stroke: #000000; stroke-width: 0.8" />
4513
  </g>
4514
  </g>
4515
  <g id="text_11">
4516
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="371.475268" transform="rotate(-0 53.23 371.475268)">0.030</text>
4517
  </g>
4518
  </g>
4519
  <g id="ytick_3">
4520
  <g id="grid-y--4" class="grid grid-y">
4521
+ <path d="M 60.23 296.908341 L 847.294169 296.908341 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4522
  </g>
4523
  <g id="line2d_12">
4524
  <g>
4525
+ <use ns4:href="#m0fca2865ba" x="60.23" y="296.908341" style="stroke: #000000; stroke-width: 0.8" />
4526
  </g>
4527
  </g>
4528
  <g id="text_12">
4529
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="300.70756" transform="rotate(-0 53.23 300.70756)">0.035</text>
4530
  </g>
4531
  </g>
4532
  <g id="ytick_4">
4533
  <g id="grid-y--5" class="grid grid-y">
4534
+ <path d="M 60.23 226.140634 L 847.294169 226.140634 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4535
  </g>
4536
  <g id="line2d_13">
4537
  <g>
4538
+ <use ns4:href="#m0fca2865ba" x="60.23" y="226.140634" style="stroke: #000000; stroke-width: 0.8" />
4539
  </g>
4540
  </g>
4541
  <g id="text_13">
4542
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="229.939852" transform="rotate(-0 53.23 229.939852)">0.040</text>
4543
  </g>
4544
  </g>
4545
  <g id="ytick_5">
4546
  <g id="grid-y--6" class="grid grid-y">
4547
+ <path d="M 60.23 155.372926 L 847.294169 155.372926 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4548
  </g>
4549
  <g id="line2d_14">
4550
  <g>
4551
+ <use ns4:href="#m0fca2865ba" x="60.23" y="155.372926" style="stroke: #000000; stroke-width: 0.8" />
4552
  </g>
4553
  </g>
4554
  <g id="text_14">
4555
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="159.172145" transform="rotate(-0 53.23 159.172145)">0.045</text>
4556
  </g>
4557
  </g>
4558
  <g id="ytick_6">
4559
  <g id="grid-y--7" class="grid grid-y">
4560
+ <path d="M 60.23 84.605219 L 847.294169 84.605219 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4561
  </g>
4562
  <g id="line2d_15">
4563
  <g>
4564
+ <use ns4:href="#m0fca2865ba" x="60.23" y="84.605219" style="stroke: #000000; stroke-width: 0.8" />
4565
  </g>
4566
  </g>
4567
  <g id="text_15">
4568
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="88.404437" transform="rotate(-0 53.23 88.404437)">0.050</text>
 
 
 
 
 
 
 
 
 
 
 
 
 
4569
  </g>
4570
  </g>
4571
  <g id="label--y" class="ylabel">
 
4573
  </g>
4574
  </g>
4575
  <g id="series--hf-kernels-swiglu" class="series">
4576
+ <path d="M 96.005644 451.16779 L 185.444754 364.40658 L 274.883864 374.045142 L 364.322974 392.869353 L 453.762084 389.882956 L 543.201194 397.667403 L 632.640304 381.532366 L 722.079415 398.106163 L 811.518525 419.478011 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4577
  <defs>
4578
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4579
  </defs>
4580
  <g clip-path="url(#p620c7d392f)">
4581
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4582
+ <use ns4:href="#md7efaf3aec" x="185.444754" y="364.40658" style="fill: #1f77b4; stroke: #1f77b4" />
4583
+ <use ns4:href="#md7efaf3aec" x="274.883864" y="374.045142" style="fill: #1f77b4; stroke: #1f77b4" />
4584
+ <use ns4:href="#md7efaf3aec" x="364.322974" y="392.869353" style="fill: #1f77b4; stroke: #1f77b4" />
4585
+ <use ns4:href="#md7efaf3aec" x="453.762084" y="389.882956" style="fill: #1f77b4; stroke: #1f77b4" />
4586
+ <use ns4:href="#md7efaf3aec" x="543.201194" y="397.667403" style="fill: #1f77b4; stroke: #1f77b4" />
4587
+ <use ns4:href="#md7efaf3aec" x="632.640304" y="381.532366" style="fill: #1f77b4; stroke: #1f77b4" />
4588
+ <use ns4:href="#md7efaf3aec" x="722.079415" y="398.106163" style="fill: #1f77b4; stroke: #1f77b4" />
4589
+ <use ns4:href="#md7efaf3aec" x="811.518525" y="419.478011" style="fill: #1f77b4; stroke: #1f77b4" />
4590
  </g>
4591
  </g>
4592
  <g id="series--torch-eager" class="series">
4593
+ <path d="M 96.005644 202.773137 L 185.444754 47.08418 L 274.883864 70.154453 L 364.322974 81.180062 L 453.762084 86.714096 L 543.201194 90.252482 L 632.640304 80.486538 L 722.079415 88.412521 L 811.518525 96.338505 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4594
  <defs>
4595
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4596
  </defs>
4597
  <g clip-path="url(#p620c7d392f)">
4598
+ <use ns4:href="#m9b8c54d372" x="96.005644" y="202.773137" style="fill: #ff7f0e; stroke: #ff7f0e" />
4599
  <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4600
+ <use ns4:href="#m9b8c54d372" x="274.883864" y="70.154453" style="fill: #ff7f0e; stroke: #ff7f0e" />
4601
+ <use ns4:href="#m9b8c54d372" x="364.322974" y="81.180062" style="fill: #ff7f0e; stroke: #ff7f0e" />
4602
+ <use ns4:href="#m9b8c54d372" x="453.762084" y="86.714096" style="fill: #ff7f0e; stroke: #ff7f0e" />
4603
+ <use ns4:href="#m9b8c54d372" x="543.201194" y="90.252482" style="fill: #ff7f0e; stroke: #ff7f0e" />
4604
+ <use ns4:href="#m9b8c54d372" x="632.640304" y="80.486538" style="fill: #ff7f0e; stroke: #ff7f0e" />
4605
+ <use ns4:href="#m9b8c54d372" x="722.079415" y="88.412521" style="fill: #ff7f0e; stroke: #ff7f0e" />
4606
+ <use ns4:href="#m9b8c54d372" x="811.518525" y="96.338505" style="fill: #ff7f0e; stroke: #ff7f0e" />
4607
  </g>
4608
  </g>
4609
  <g id="patch_3">
 
4618
  <g id="patch_6">
4619
  <path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4620
  </g>
4621
+ <g id="text_16">
4622
  <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
4623
  </g>
4624
  <g id="legend" class="legend">
4625
  <g id="patch_7">
4626
  <path d="M 720.811356 64.7925 L 840.294169 64.7925 Q 842.294169 64.7925 842.294169 62.7925 L 842.294169 33.88 Q 842.294169 31.88 840.294169 31.88 L 720.811356 31.88 Q 718.811356 31.88 718.811356 33.88 L 718.811356 62.7925 Q 718.811356 64.7925 720.811356 64.7925 L 720.811356 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4627
  </g>
4628
+ <g id="line2d_16">
4629
  <path d="M 722.811356 39.978438 L 732.811356 39.978438 L 742.811356 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4630
  <g>
4631
  <use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
 
4634
  <g id="legend-label--hf-kernels-swiglu" class="legend">
4635
  <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
4636
  </g>
4637
+ <g id="line2d_17">
4638
  <path d="M 722.811356 54.934687 L 732.811356 54.934687 L 742.811356 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4639
  <g>
4640
  <use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl CHANGED
@@ -1,24 +1,24 @@
1
- {"ts": "2025-10-29T14:27:35Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047051000024112, "p50": 0.048391000007086404, "p90": 0.048571999968771706, "mean": 0.04890720000503279, "iqr": 0.0005509999709829572, "raw_times": [0.0525010000274051, 0.048571999968771706, 0.04802099999778875, 0.048391000007086404, 0.047051000024112], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058991999992485944, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
2
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05480199996554802, "p50": 0.05610099998420992, "p90": 0.05628099995647062, "mean": 0.056069199968078465, "iqr": 0.0006299999881775875, "raw_times": [0.057510999965870724, 0.05628099995647062, 0.05610099998420992, 0.055650999968293036, 0.05480199996554802], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06025200002568454, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
3
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05469199999197372, "p50": 0.056971999981669796, "p90": 0.057361000017408514, "mean": 0.056363600003805914, "iqr": 0.0025490000439276628, "raw_times": [0.05469199999197372, 0.057361000017408514, 0.056971999981669796, 0.05798100005449669, 0.05481199997348085], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0596809999819925, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
4
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05416100003685642, "p50": 0.05502099998011545, "p90": 0.05511200004093553, "mean": 0.05489540000098714, "iqr": 0.00016000007008187822, "raw_times": [0.05416100003685642, 0.05495199997085365, 0.055230999976174644, 0.05511200004093553, 0.05502099998011545], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05659100003185813, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
5
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052401000004920206, "p50": 0.055000999964249786, "p90": 0.056301000029179704, "mean": 0.054810999995424936, "iqr": 0.0023400000372930663, "raw_times": [0.052401000004920206, 0.056301000029179704, 0.056390999986888346, 0.05396099999188664, 0.055000999964249786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05838100003074942, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
6
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05270100001553146, "p50": 0.05342100001826111, "p90": 0.054111999986616865, "mean": 0.053611199996339565, "iqr": 0.0008709999974598759, "raw_times": [0.05324099998915699, 0.054580999972131394, 0.054111999986616865, 0.05342100001826111, 0.05270100001553146], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0580610000042725, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
7
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051810999991630524, "p50": 0.05310099999178419, "p90": 0.053301000036753976, "mean": 0.05577720000928821, "iqr": 0.0007700000423938036, "raw_times": [0.051810999991630524, 0.05253099999436017, 0.06814200003191218, 0.053301000036753976, 0.05310099999178419], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056801999960498506, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
8
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.053270999956112064, "p50": 0.05397199998924407, "p90": 0.05399100001568513, "mean": 0.054061200000887766, "iqr": 0.00023000001192485797, "raw_times": [0.05399100001568513, 0.05531100003963729, 0.053270999956112064, 0.05397199998924407, 0.05376100000376027], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056641000014678866, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
9
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052231000040592335, "p50": 0.052561000018158666, "p90": 0.0526110000009794, "mean": 0.0529970000116009, "iqr": 0.0002200000039920269, "raw_times": [0.052231000040592335, 0.052390999996987375, 0.05519100000128674, 0.052561000018158666, 0.0526110000009794], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05621100001462764, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
10
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052561000018158666, "p50": 0.053772000001117704, "p90": 0.05471100001841478, "mean": 0.057879400003457704, "iqr": 0.0021000000174353772, "raw_times": [0.0526110000009794, 0.05471100001841478, 0.053772000001117704, 0.052561000018158666, 0.07574199997861797], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06976200000963217, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
11
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051341000016691396, "p50": 0.05308099997591853, "p90": 0.05349200000637211, "mean": 0.05272739998645193, "iqr": 0.0014310000437944836, "raw_times": [0.052060999962577625, 0.05366199997069998, 0.05308099997591853, 0.051341000016691396, 0.05349200000637211], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05661099999088037, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
12
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05081099999415528, "p50": 0.053202000003693684, "p90": 0.05362099994954406, "mean": 0.05282339998302632, "iqr": 0.0011499999459374521, "raw_times": [0.05081099999415528, 0.052471000003606605, 0.05362099994954406, 0.054011999964131974, 0.053202000003693684], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05642200000011144, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
13
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052221000032659504, "p50": 0.05397099999981947, "p90": 0.05448100000648992, "mean": 0.05380100001275423, "iqr": 0.001479999980347202, "raw_times": [0.052221000032659504, 0.05300100002614272, 0.055330999998659536, 0.05448100000648992, 0.05397099999981947], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058421999995061924, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
14
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05164199995988383, "p50": 0.053460999993149017, "p90": 0.05421100001967716, "mean": 0.0532791999944493, "iqr": 0.00139000002263856, "raw_times": [0.0528209999970386, 0.054261000002497894, 0.05421100001967716, 0.05164199995988383, 0.053460999993149017], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05741199998965385, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
15
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052460999995673774, "p50": 0.0557109999590466, "p90": 0.05585100001326282, "mean": 0.05600519999688913, "iqr": 0.002330000029360235, "raw_times": [0.052460999995673774, 0.0557109999590466, 0.05585100001326282, 0.053520999983902584, 0.06248200003255988], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058421000005637325, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
16
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0540510000064387, "p50": 0.0541219999945497, "p90": 0.05425200004083308, "mean": 0.054337400013082515, "iqr": 0.0001910000264615519, "raw_times": [0.0540510000064387, 0.05425200004083308, 0.05520100000921957, 0.0541219999945497, 0.05406100001437153], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05772200000819794, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
17
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05302099998516496, "p50": 0.05408099997339377, "p90": 0.0544409999747586, "mean": 0.05571119997966889, "iqr": 0.0008299999763039523, "raw_times": [0.05302099998516496, 0.0544409999747586, 0.06340199996657248, 0.053610999998454645, 0.05408099997339377], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05787100002407897, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
18
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052391999986411975, "p50": 0.05292200000894809, "p90": 0.053131000015582686, "mean": 0.05318180001268047, "iqr": 0.00026899999738816405, "raw_times": [0.053131000015582686, 0.054602000034265075, 0.052391999986411975, 0.05286200001819452, 0.05292200000894809], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05755099999760205, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
19
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05195099998900332, "p50": 0.05479100002503401, "p90": 0.05620100000669481, "mean": 0.05852919999824735, "iqr": 0.0034000000255218765, "raw_times": [0.07690199998933167, 0.05620100000669481, 0.05479100002503401, 0.05195099998900332, 0.052800999981172936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057030999982998765, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
20
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05290100000365783, "p50": 0.05457100002104198, "p90": 0.055251000048883725, "mean": 0.055353400011881604, "iqr": 0.001779000058377278, "raw_times": [0.05290100000365783, 0.06057199999531804, 0.055251000048883725, 0.05347199999050645, 0.05457100002104198], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056300999972336285, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
21
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052152000023397704, "p50": 0.05461199998535449, "p90": 0.05471200000783938, "mean": 0.05381760001910152, "iqr": 0.001900999961890193, "raw_times": [0.052152000023397704, 0.05461199998535449, 0.05480100003296684, 0.05471200000783938, 0.052811000045949186], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05758200001082514, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
22
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05339099999446262, "p50": 0.05463100001179555, "p90": 0.055171999974845676, "mean": 0.05451339999353877, "iqr": 0.0011309999763398082, "raw_times": [0.05404099999850587, 0.055331999988084135, 0.05339099999446262, 0.05463100001179555, 0.055171999974845676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058501000012256554, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
23
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05277100001421786, "p50": 0.053860999969401746, "p90": 0.054361000024982786, "mean": 0.053951200004576094, "iqr": 0.0007190000133050489, "raw_times": [0.05277100001421786, 0.05512100000260034, 0.05364200001167774, 0.053860999969401746, 0.054361000024982786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057851999997637904, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
24
- {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05124100005104992, "p50": 0.053691000005073874, "p90": 0.054261999991922494, "mean": 0.05327740001348502, "iqr": 0.0014609999539061391, "raw_times": [0.05124100005104992, 0.05439199998136246, 0.054261999991922494, 0.053691000005073874, 0.052801000038016355], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05714199994599767, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07023200004141472, "p50": 0.07095199998730095, "p90": 0.07123199998204655, "mean": 0.07353400000056354, "iqr": 0.0008999999749903509, "raw_times": [0.07095199998730095, 0.08492199998499927, 0.0703320000070562, 0.07123199998204655, 0.07023200004141472], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07603100004871521, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
2
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08460200001536577, "p50": 0.08611200001951147, "p90": 0.08698200002754675, "mean": 0.08602200001632809, "iqr": 0.001740000016070553, "raw_times": [0.08460200001536577, 0.08611200001951147, 0.08698200002754675, 0.08717200000774028, 0.0852420000114762], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08820200002901402, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
3
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08334199998216718, "p50": 0.08516200000485696, "p90": 0.08565199999566175, "mean": 0.08509399999638845, "iqr": 0.0015599999869664316, "raw_times": [0.08334199998216718, 0.08722199999056102, 0.08565199999566175, 0.08516200000485696, 0.08409200000869532], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0867219999918234, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
4
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08196199996746145, "p50": 0.08375099997692814, "p90": 0.08384200003774822, "mean": 0.08337179999671207, "iqr": 0.0010800000609378912, "raw_times": [0.08276199997681033, 0.08454200002461221, 0.08384200003774822, 0.08375099997692814, 0.08196199996746145], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08716199999980745, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
5
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08276199997681033, "p50": 0.08335200004694343, "p90": 0.08474200001273857, "mean": 0.08374400000548121, "iqr": 0.0019199999883312557, "raw_times": [0.08335200004694343, 0.08474200001273857, 0.08504199996650641, 0.08276199997681033, 0.08282200002440732], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08652200000369703, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
6
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08203199996614785, "p50": 0.08333200003107777, "p90": 0.08342199998878641, "mean": 0.08316619998822716, "iqr": 0.0006700000199089118, "raw_times": [0.08203199996614785, 0.08333200003107777, 0.08342199998878641, 0.0827519999688775, 0.08429299998624629], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08916199999475793, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
7
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08283200003234015, "p50": 0.08409299999811992, "p90": 0.08469200002991784, "mean": 0.08781020001151774, "iqr": 0.001050000037139398, "raw_times": [0.08469200002991784, 0.08409299999811992, 0.10379200000443234, 0.08364199999277844, 0.08283200003234015], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08838200000127472, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
8
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08139199997003743, "p50": 0.08336199999803284, "p90": 0.08399199998621043, "mean": 0.0832759999980226, "iqr": 0.0010599999313853914, "raw_times": [0.08139199997003743, 0.08470199998100725, 0.08293200005482504, 0.08336199999803284, 0.08399199998621043], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08715199999187462, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
9
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08235199999262477, "p50": 0.08327199998348078, "p90": 0.0835210000218467, "mean": 0.08336580000332106, "iqr": 0.00033899999607456266, "raw_times": [0.08235199999262477, 0.0835210000218467, 0.08450199999288088, 0.08327199998348078, 0.08318200002577214], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08735199998000098, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
10
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08233200003360253, "p50": 0.08335199999010001, "p90": 0.08342199998878641, "mean": 0.08314600000858263, "iqr": 0.0004799999828719592, "raw_times": [0.08342199998878641, 0.08233200003360253, 0.08335199999010001, 0.08294200000591445, 0.08368200002450976], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08666200000106983, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
11
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.1449639999577812, "p50": 0.14544300000807198, "p90": 0.14571399998430934, "mean": 0.14548759999115646, "iqr": 0.00032100001590151805, "raw_times": [0.14544300000807198, 0.1449639999577812, 0.14539299996840782, 0.14571399998430934, 0.14592400003721195], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.14803300001631214, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
12
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16181400002324153, "p50": 0.1630739999995967, "p90": 0.16360400002213282, "mean": 0.16567200000281446, "iqr": 0.0017800000478018774, "raw_times": [0.16181400002324153, 0.17804399999477027, 0.1630739999995967, 0.16182399997433095, 0.16360400002213282], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.16251400001010552, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
13
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08132199997135103, "p50": 0.08263099999794576, "p90": 0.08295200001384728, "mean": 0.0824317999899904, "iqr": 0.0009100000397666008, "raw_times": [0.08132199997135103, 0.08321199999272721, 0.08204199997408068, 0.08295200001384728, 0.08263099999794576], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08486200005108913, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
14
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08171299998593895, "p50": 0.08253199996488547, "p90": 0.08321199999272721, "mean": 0.08254819997546292, "iqr": 0.001280000049064256, "raw_times": [0.08193199994366296, 0.08335199999010001, 0.08321199999272721, 0.08171299998593895, 0.08253199996488547], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08592199998247452, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
15
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08148200004143291, "p50": 0.08176199997933509, "p90": 0.08353199996236071, "mean": 0.08242180000479493, "iqr": 0.002030999951330159, "raw_times": [0.08150100001103056, 0.08148200004143291, 0.08383200002981539, 0.08176199997933509, 0.08353199996236071], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08680199999844262, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
16
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08221299998467657, "p50": 0.08294200000591445, "p90": 0.08321200004957063, "mean": 0.08299800000486357, "iqr": 0.0007910000476840651, "raw_times": [0.08242100000188657, 0.08420199998226963, 0.08321200004957063, 0.08221299998467657, 0.08294200000591445], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08663200003411475, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
17
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08301200000460085, "p50": 0.08371199999146484, "p90": 0.08385299997826223, "mean": 0.08369219999622146, "iqr": 0.0001610000026630587, "raw_times": [0.08301200000460085, 0.08371199999146484, 0.08419200003118021, 0.08385299997826223, 0.08369199997559917], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.086651999993137, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
18
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08138200001894802, "p50": 0.08318200002577214, "p90": 0.08328199999141361, "mean": 0.08309020000751843, "iqr": 0.0010899999551838846, "raw_times": [0.08219200003622973, 0.08541299996522866, 0.08318200002577214, 0.08138200001894802, 0.08328199999141361], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08645299999443523, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
19
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0822520000269833, "p50": 0.08321100000330262, "p90": 0.08357199999409204, "mean": 0.08451599999261816, "iqr": 0.0009600000225873373, "raw_times": [0.09093299996720816, 0.0822520000269833, 0.08321100000330262, 0.0826119999715047, 0.08357199999409204], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08730200005402367, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
20
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08279200000060882, "p50": 0.08370200004037542, "p90": 0.08400199999414326, "mean": 0.08373800000072151, "iqr": 0.0006500000040432496, "raw_times": [0.08335199999010001, 0.08400199999414326, 0.08484199997838004, 0.08279200000060882, 0.08370200004037542], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08856199997353542, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
21
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09200200003078862, "p50": 0.09372200003099351, "p90": 0.09380200003761274, "mean": 0.09347200001457168, "iqr": 0.00012000003835055395, "raw_times": [0.09200200003078862, 0.09415199997420132, 0.09380200003761274, 0.09372200003099351, 0.09368199999926219], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09485200001790872, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
22
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.098961999981384, "p50": 0.10011200004100829, "p90": 0.10014200000796336, "mean": 0.10138220001181253, "iqr": 0.0004400000079840538, "raw_times": [0.09970199999997931, 0.098961999981384, 0.10011200004100829, 0.10014200000796336, 0.10799300002872769], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.11010200000782788, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
23
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.48627099999976053, "p50": 0.48646099997995407, "p90": 0.4873609999549444, "mean": 0.48691319999534244, "iqr": 0.00103899992609513, "raw_times": [0.48627099999976053, 0.4873609999549444, 0.4881510000132039, 0.4863220000288493, 0.48646099997995407], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.48353100004305816, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
24
+ {"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.491851000049337, "p50": 0.49710199999708493, "p90": 0.49729199997727846, "mean": 0.49653980000812226, "iqr": 0.0012099999935344385, "raw_times": [0.496081999983744, 0.5003720000331668, 0.49729199997727846, 0.49710199999708493, 0.491851000049337], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5018319999976484, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
causal_conv1d/impls/cells/benchmark.py CHANGED
@@ -4,28 +4,37 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
- # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
11
  # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
12
  # ///
13
  import torch
 
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
- from kernels import get_kernel
17
 
18
- # Load the causal conv1d kernel
19
- causal_conv1d = get_kernel("kernels-community/causal-conv1d")
20
 
 
 
 
 
 
 
21
 
22
- def hf_kernels_causal_conv1d(input_tensor, weight, bias):
23
- return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias)
 
 
 
 
 
 
24
 
25
 
26
  run_benchmark(
27
  kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
28
- impl_name="hf_kernels_causal_conv1d",
29
- impl_tags={"family": "hf-kernels", "backend": "cuda"},
30
- impl_func=hf_kernels_causal_conv1d,
31
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
10
  # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
11
  # ///
12
  import torch
13
+ import torch.nn.functional as F
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 
16
 
 
 
17
 
18
+ def torch_causal_conv1d(input_tensor, weight, bias):
19
+ # Convert to weight dtype for computation
20
+ x = input_tensor.to(weight.dtype)
21
+ dim = weight.shape[0]
22
+ width = weight.shape[1]
23
+ seqlen = input_tensor.shape[-1]
24
 
25
+ # Depthwise causal conv1d using PyTorch
26
+ out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
27
+
28
+ # Truncate to original sequence length
29
+ out = out[..., :seqlen]
30
+
31
+ # Convert back to original dtype
32
+ return out.to(input_tensor.dtype)
33
 
34
 
35
  run_benchmark(
36
  kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
37
+ impl_name="torch_eager",
38
+ impl_tags={"family": "pytorch", "backend": "eager"},
39
+ impl_func=torch_causal_conv1d,
40
  )
causal_conv1d/impls/hf_kernels_causal_conv1d.html CHANGED
The diff for this file is too large to render. See raw diff
 
causal_conv1d/impls/torch_causal_conv1d.html CHANGED
The diff for this file is too large to render. See raw diff
 
causal_conv1d/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 3d92f3a3aa92e11f21958cf1c591a4e709fd40f7b0cccbd544c1e1a77b11bcd2
  • Pointer size: 130 Bytes
  • Size of remote file: 35.4 kB

Git LFS Details

  • SHA256: 1d242a099b34afd09f08c43f438a0f0428d98a0ebd51a9a36d0be25ca9da89df
  • Pointer size: 130 Bytes
  • Size of remote file: 35.4 kB
causal_conv1d/results/combined_results.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
@@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
- <dc:date>2025-10-29T14:27:58.771179</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
@@ -4216,70 +4224,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4216
  <g id="matplotlib.axis_2">
4217
  <g id="ytick_1">
4218
  <g id="grid-y--2" class="grid grid-y">
4219
- <path d="M 47.72 377.079386 L 831.034248 377.079386 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4220
  </g>
4221
  <g id="line2d_25">
4222
  <defs>
4223
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4224
  </defs>
4225
  <g>
4226
- <use ns4:href="#m0fca2865ba" x="47.72" y="377.079386" style="stroke: #000000; stroke-width: 0.8" />
4227
  </g>
4228
  </g>
4229
  <g id="text_25">
4230
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="380.878605" transform="rotate(-0 40.72 380.878605)">0.1</text>
4231
  </g>
4232
  </g>
4233
  <g id="ytick_2">
4234
  <g id="grid-y--3" class="grid grid-y">
4235
- <path d="M 47.72 293.552318 L 831.034248 293.552318 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4236
  </g>
4237
  <g id="line2d_26">
4238
  <g>
4239
- <use ns4:href="#m0fca2865ba" x="47.72" y="293.552318" style="stroke: #000000; stroke-width: 0.8" />
4240
  </g>
4241
  </g>
4242
  <g id="text_26">
4243
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="297.351537" transform="rotate(-0 40.72 297.351537)">0.2</text>
4244
  </g>
4245
  </g>
4246
  <g id="ytick_3">
4247
  <g id="grid-y--4" class="grid grid-y">
4248
- <path d="M 47.72 210.02525 L 831.034248 210.02525 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4249
  </g>
4250
  <g id="line2d_27">
4251
  <g>
4252
- <use ns4:href="#m0fca2865ba" x="47.72" y="210.02525" style="stroke: #000000; stroke-width: 0.8" />
4253
  </g>
4254
  </g>
4255
  <g id="text_27">
4256
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.824469" transform="rotate(-0 40.72 213.824469)">0.3</text>
4257
  </g>
4258
  </g>
4259
  <g id="ytick_4">
4260
  <g id="grid-y--5" class="grid grid-y">
4261
- <path d="M 47.72 126.498182 L 831.034248 126.498182 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4262
  </g>
4263
  <g id="line2d_28">
4264
  <g>
4265
- <use ns4:href="#m0fca2865ba" x="47.72" y="126.498182" style="stroke: #000000; stroke-width: 0.8" />
4266
  </g>
4267
  </g>
4268
  <g id="text_28">
4269
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.297401" transform="rotate(-0 40.72 130.297401)">0.4</text>
4270
  </g>
4271
  </g>
4272
  <g id="ytick_5">
4273
  <g id="grid-y--6" class="grid grid-y">
4274
- <path d="M 47.72 42.971114 L 831.034248 42.971114 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4275
  </g>
4276
  <g id="line2d_29">
4277
  <g>
4278
- <use ns4:href="#m0fca2865ba" x="47.72" y="42.971114" style="stroke: #000000; stroke-width: 0.8" />
4279
  </g>
4280
  </g>
4281
  <g id="text_29">
4282
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="46.770333" transform="rotate(-0 40.72 46.770333)">0.5</text>
4283
  </g>
4284
  </g>
4285
  <g id="label--y" class="ylabel">
@@ -4287,66 +4295,66 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4287
  </g>
4288
  </g>
4289
  <g id="series--hf-kernels-causal-conv1d" class="series">
4290
- <path d="M 83.325193 420.186871 L 114.286231 413.746934 L 145.247268 413.019413 L 176.208306 414.649026 L 207.169343 414.665731 L 238.130381 415.985459 L 269.091418 416.252746 L 300.052455 415.525225 L 331.013493 416.703792 L 361.97453 415.692279 L 392.935568 416.269451 L 423.896605 416.168383 L 454.857643 415.52606 L 485.81868 415.952048 L 516.779718 414.072689 L 547.740755 415.399934 L 578.701793 415.43418 L 609.66283 416.402259 L 640.623868 414.841138 L 671.584905 415.024898 L 702.545943 414.990652 L 733.50698 414.974782 L 764.468018 415.61794 L 795.429055 415.759936 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4291
  <defs>
4292
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4293
  </defs>
4294
  <g clip-path="url(#pb49fc4c8d2)">
4295
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
4296
- <use ns4:href="#md7efaf3aec" x="114.286231" y="413.746934" style="fill: #1f77b4; stroke: #1f77b4" />
4297
- <use ns4:href="#md7efaf3aec" x="145.247268" y="413.019413" style="fill: #1f77b4; stroke: #1f77b4" />
4298
- <use ns4:href="#md7efaf3aec" x="176.208306" y="414.649026" style="fill: #1f77b4; stroke: #1f77b4" />
4299
- <use ns4:href="#md7efaf3aec" x="207.169343" y="414.665731" style="fill: #1f77b4; stroke: #1f77b4" />
4300
- <use ns4:href="#md7efaf3aec" x="238.130381" y="415.985459" style="fill: #1f77b4; stroke: #1f77b4" />
4301
- <use ns4:href="#md7efaf3aec" x="269.091418" y="416.252746" style="fill: #1f77b4; stroke: #1f77b4" />
4302
- <use ns4:href="#md7efaf3aec" x="300.052455" y="415.525225" style="fill: #1f77b4; stroke: #1f77b4" />
4303
- <use ns4:href="#md7efaf3aec" x="331.013493" y="416.703792" style="fill: #1f77b4; stroke: #1f77b4" />
4304
- <use ns4:href="#md7efaf3aec" x="361.97453" y="415.692279" style="fill: #1f77b4; stroke: #1f77b4" />
4305
- <use ns4:href="#md7efaf3aec" x="392.935568" y="416.269451" style="fill: #1f77b4; stroke: #1f77b4" />
4306
- <use ns4:href="#md7efaf3aec" x="423.896605" y="416.168383" style="fill: #1f77b4; stroke: #1f77b4" />
4307
- <use ns4:href="#md7efaf3aec" x="454.857643" y="415.52606" style="fill: #1f77b4; stroke: #1f77b4" />
4308
- <use ns4:href="#md7efaf3aec" x="485.81868" y="415.952048" style="fill: #1f77b4; stroke: #1f77b4" />
4309
- <use ns4:href="#md7efaf3aec" x="516.779718" y="414.072689" style="fill: #1f77b4; stroke: #1f77b4" />
4310
- <use ns4:href="#md7efaf3aec" x="547.740755" y="415.399934" style="fill: #1f77b4; stroke: #1f77b4" />
4311
- <use ns4:href="#md7efaf3aec" x="578.701793" y="415.43418" style="fill: #1f77b4; stroke: #1f77b4" />
4312
- <use ns4:href="#md7efaf3aec" x="609.66283" y="416.402259" style="fill: #1f77b4; stroke: #1f77b4" />
4313
- <use ns4:href="#md7efaf3aec" x="640.623868" y="414.841138" style="fill: #1f77b4; stroke: #1f77b4" />
4314
- <use ns4:href="#md7efaf3aec" x="671.584905" y="415.024898" style="fill: #1f77b4; stroke: #1f77b4" />
4315
- <use ns4:href="#md7efaf3aec" x="702.545943" y="414.990652" style="fill: #1f77b4; stroke: #1f77b4" />
4316
- <use ns4:href="#md7efaf3aec" x="733.50698" y="414.974782" style="fill: #1f77b4; stroke: #1f77b4" />
4317
- <use ns4:href="#md7efaf3aec" x="764.468018" y="415.61794" style="fill: #1f77b4; stroke: #1f77b4" />
4318
- <use ns4:href="#md7efaf3aec" x="795.429055" y="415.759936" style="fill: #1f77b4; stroke: #1f77b4" />
4319
  </g>
4320
  </g>
4321
  <g id="series--torch-eager" class="series">
4322
- <path d="M 83.325193 401.710683 L 114.286231 389.180788 L 145.247268 389.523249 L 176.208306 390.141349 L 207.169343 391.126968 L 238.130381 390.809566 L 269.091418 390.934856 L 300.052455 390.667569 L 331.013493 390.500515 L 361.97453 389.707008 L 392.935568 339.037818 L 423.896605 325.239147 L 454.857643 391.043441 L 485.81868 391.009195 L 516.779718 391.143674 L 547.740755 390.442046 L 578.701793 390.951562 L 609.66283 389.129836 L 640.623868 391.795185 L 671.584905 391.319081 L 702.545943 381.654999 L 733.50698 375.966806 L 764.468018 53.96077 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4323
  <defs>
4324
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4325
  </defs>
4326
  <g clip-path="url(#pb49fc4c8d2)">
4327
- <use ns4:href="#m9b8c54d372" x="83.325193" y="401.710683" style="fill: #ff7f0e; stroke: #ff7f0e" />
4328
- <use ns4:href="#m9b8c54d372" x="114.286231" y="389.180788" style="fill: #ff7f0e; stroke: #ff7f0e" />
4329
- <use ns4:href="#m9b8c54d372" x="145.247268" y="389.523249" style="fill: #ff7f0e; stroke: #ff7f0e" />
4330
- <use ns4:href="#m9b8c54d372" x="176.208306" y="390.141349" style="fill: #ff7f0e; stroke: #ff7f0e" />
4331
- <use ns4:href="#m9b8c54d372" x="207.169343" y="391.126968" style="fill: #ff7f0e; stroke: #ff7f0e" />
4332
- <use ns4:href="#m9b8c54d372" x="238.130381" y="390.809566" style="fill: #ff7f0e; stroke: #ff7f0e" />
4333
- <use ns4:href="#m9b8c54d372" x="269.091418" y="390.934856" style="fill: #ff7f0e; stroke: #ff7f0e" />
4334
- <use ns4:href="#m9b8c54d372" x="300.052455" y="390.667569" style="fill: #ff7f0e; stroke: #ff7f0e" />
4335
- <use ns4:href="#m9b8c54d372" x="331.013493" y="390.500515" style="fill: #ff7f0e; stroke: #ff7f0e" />
4336
- <use ns4:href="#m9b8c54d372" x="361.97453" y="389.707008" style="fill: #ff7f0e; stroke: #ff7f0e" />
4337
- <use ns4:href="#m9b8c54d372" x="392.935568" y="339.037818" style="fill: #ff7f0e; stroke: #ff7f0e" />
4338
- <use ns4:href="#m9b8c54d372" x="423.896605" y="325.239147" style="fill: #ff7f0e; stroke: #ff7f0e" />
4339
- <use ns4:href="#m9b8c54d372" x="454.857643" y="391.043441" style="fill: #ff7f0e; stroke: #ff7f0e" />
4340
- <use ns4:href="#m9b8c54d372" x="485.81868" y="391.009195" style="fill: #ff7f0e; stroke: #ff7f0e" />
4341
- <use ns4:href="#m9b8c54d372" x="516.779718" y="391.143674" style="fill: #ff7f0e; stroke: #ff7f0e" />
4342
- <use ns4:href="#m9b8c54d372" x="547.740755" y="390.442046" style="fill: #ff7f0e; stroke: #ff7f0e" />
4343
- <use ns4:href="#m9b8c54d372" x="578.701793" y="390.951562" style="fill: #ff7f0e; stroke: #ff7f0e" />
4344
- <use ns4:href="#m9b8c54d372" x="609.66283" y="389.129836" style="fill: #ff7f0e; stroke: #ff7f0e" />
4345
- <use ns4:href="#m9b8c54d372" x="640.623868" y="391.795185" style="fill: #ff7f0e; stroke: #ff7f0e" />
4346
- <use ns4:href="#m9b8c54d372" x="671.584905" y="391.319081" style="fill: #ff7f0e; stroke: #ff7f0e" />
4347
- <use ns4:href="#m9b8c54d372" x="702.545943" y="381.654999" style="fill: #ff7f0e; stroke: #ff7f0e" />
4348
- <use ns4:href="#m9b8c54d372" x="733.50698" y="375.966806" style="fill: #ff7f0e; stroke: #ff7f0e" />
4349
- <use ns4:href="#m9b8c54d372" x="764.468018" y="53.96077" style="fill: #ff7f0e; stroke: #ff7f0e" />
4350
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
4351
  </g>
4352
  </g>
@@ -4405,7 +4413,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4405
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4406
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4407
  </span> |
4408
- Cell: combine | 4.32s
4409
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4410
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4411
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4498,12 +4506,12 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W2 0.05 True
4498
  hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True
4499
  hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True
4500
  hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True
4501
- hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.05 True
4502
- hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.06 True
4503
- hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.06 True
4504
  hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True
4505
- hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.06 True
4506
- hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.06 True
4507
  hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2 0.05 True
4508
  hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4 0.05 True
4509
  hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2 0.05 True
@@ -4514,7 +4522,7 @@ hf_kernels_causal_conv1d cuda_B4_D64_S128_W2 0.05 True
4514
  hf_kernels_causal_conv1d cuda_B4_D64_S128_W4 0.05 True
4515
  hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2 0.05 True
4516
  hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4 0.05 True
4517
- hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.06 True
4518
  hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
4519
  torch_eager cuda_B2_D2048_S128_W2 0.08 True
4520
  torch_eager cuda_B2_D2048_S128_W4 0.08 True
@@ -4537,7 +4545,7 @@ torch_eager cuda_B4_D2048_S512_W4 0.10 True
4537
  torch_eager cuda_B4_D64_S128_W2 0.08 True
4538
  torch_eager cuda_B4_D64_S128_W4 0.08 True
4539
  torch_eager cuda_B4_D64_S2048_W2 0.08 True
4540
- torch_eager cuda_B4_D64_S2048_W4 0.09 True
4541
  torch_eager cuda_B4_D64_S512_W2 0.08 True
4542
  torch_eager cuda_B4_D64_S512_W4 0.08 True
4543
 
@@ -4559,7 +4567,7 @@ Implementations included:
4559
  <div class="uv-install-logs" id="uv-logs-combine">
4560
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4561
  <div class="uv-logs-content" style="display: none;">
4562
- Installed 37 packages in 214ms
4563
  </div>
4564
  </div>
4565
  <div class="cell-artifacts">
@@ -4572,7 +4580,7 @@ Installed 37 packages in 214ms
4572
  <rdf:RDF>
4573
  <ns2:Work>
4574
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4575
- <dc:date>2025-10-29T14:27:58.771179</dc:date>
4576
  <dc:format>image/svg+xml</dc:format>
4577
  <dc:creator>
4578
  <ns2:Agent>
@@ -4916,70 +4924,70 @@ Installed 37 packages in 214ms
4916
  <g id="matplotlib.axis_2">
4917
  <g id="ytick_1">
4918
  <g id="grid-y--2" class="grid grid-y">
4919
- <path d="M 47.72 377.079386 L 831.034248 377.079386 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4920
  </g>
4921
  <g id="line2d_25">
4922
  <defs>
4923
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4924
  </defs>
4925
  <g>
4926
- <use ns4:href="#m0fca2865ba" x="47.72" y="377.079386" style="stroke: #000000; stroke-width: 0.8" />
4927
  </g>
4928
  </g>
4929
  <g id="text_25">
4930
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="380.878605" transform="rotate(-0 40.72 380.878605)">0.1</text>
4931
  </g>
4932
  </g>
4933
  <g id="ytick_2">
4934
  <g id="grid-y--3" class="grid grid-y">
4935
- <path d="M 47.72 293.552318 L 831.034248 293.552318 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4936
  </g>
4937
  <g id="line2d_26">
4938
  <g>
4939
- <use ns4:href="#m0fca2865ba" x="47.72" y="293.552318" style="stroke: #000000; stroke-width: 0.8" />
4940
  </g>
4941
  </g>
4942
  <g id="text_26">
4943
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="297.351537" transform="rotate(-0 40.72 297.351537)">0.2</text>
4944
  </g>
4945
  </g>
4946
  <g id="ytick_3">
4947
  <g id="grid-y--4" class="grid grid-y">
4948
- <path d="M 47.72 210.02525 L 831.034248 210.02525 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4949
  </g>
4950
  <g id="line2d_27">
4951
  <g>
4952
- <use ns4:href="#m0fca2865ba" x="47.72" y="210.02525" style="stroke: #000000; stroke-width: 0.8" />
4953
  </g>
4954
  </g>
4955
  <g id="text_27">
4956
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.824469" transform="rotate(-0 40.72 213.824469)">0.3</text>
4957
  </g>
4958
  </g>
4959
  <g id="ytick_4">
4960
  <g id="grid-y--5" class="grid grid-y">
4961
- <path d="M 47.72 126.498182 L 831.034248 126.498182 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4962
  </g>
4963
  <g id="line2d_28">
4964
  <g>
4965
- <use ns4:href="#m0fca2865ba" x="47.72" y="126.498182" style="stroke: #000000; stroke-width: 0.8" />
4966
  </g>
4967
  </g>
4968
  <g id="text_28">
4969
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.297401" transform="rotate(-0 40.72 130.297401)">0.4</text>
4970
  </g>
4971
  </g>
4972
  <g id="ytick_5">
4973
  <g id="grid-y--6" class="grid grid-y">
4974
- <path d="M 47.72 42.971114 L 831.034248 42.971114 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4975
  </g>
4976
  <g id="line2d_29">
4977
  <g>
4978
- <use ns4:href="#m0fca2865ba" x="47.72" y="42.971114" style="stroke: #000000; stroke-width: 0.8" />
4979
  </g>
4980
  </g>
4981
  <g id="text_29">
4982
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="46.770333" transform="rotate(-0 40.72 46.770333)">0.5</text>
4983
  </g>
4984
  </g>
4985
  <g id="label--y" class="ylabel">
@@ -4987,66 +4995,66 @@ Installed 37 packages in 214ms
4987
  </g>
4988
  </g>
4989
  <g id="series--hf-kernels-causal-conv1d" class="series">
4990
- <path d="M 83.325193 420.186871 L 114.286231 413.746934 L 145.247268 413.019413 L 176.208306 414.649026 L 207.169343 414.665731 L 238.130381 415.985459 L 269.091418 416.252746 L 300.052455 415.525225 L 331.013493 416.703792 L 361.97453 415.692279 L 392.935568 416.269451 L 423.896605 416.168383 L 454.857643 415.52606 L 485.81868 415.952048 L 516.779718 414.072689 L 547.740755 415.399934 L 578.701793 415.43418 L 609.66283 416.402259 L 640.623868 414.841138 L 671.584905 415.024898 L 702.545943 414.990652 L 733.50698 414.974782 L 764.468018 415.61794 L 795.429055 415.759936 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4991
  <defs>
4992
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4993
  </defs>
4994
  <g clip-path="url(#pb49fc4c8d2)">
4995
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
4996
- <use ns4:href="#md7efaf3aec" x="114.286231" y="413.746934" style="fill: #1f77b4; stroke: #1f77b4" />
4997
- <use ns4:href="#md7efaf3aec" x="145.247268" y="413.019413" style="fill: #1f77b4; stroke: #1f77b4" />
4998
- <use ns4:href="#md7efaf3aec" x="176.208306" y="414.649026" style="fill: #1f77b4; stroke: #1f77b4" />
4999
- <use ns4:href="#md7efaf3aec" x="207.169343" y="414.665731" style="fill: #1f77b4; stroke: #1f77b4" />
5000
- <use ns4:href="#md7efaf3aec" x="238.130381" y="415.985459" style="fill: #1f77b4; stroke: #1f77b4" />
5001
- <use ns4:href="#md7efaf3aec" x="269.091418" y="416.252746" style="fill: #1f77b4; stroke: #1f77b4" />
5002
- <use ns4:href="#md7efaf3aec" x="300.052455" y="415.525225" style="fill: #1f77b4; stroke: #1f77b4" />
5003
- <use ns4:href="#md7efaf3aec" x="331.013493" y="416.703792" style="fill: #1f77b4; stroke: #1f77b4" />
5004
- <use ns4:href="#md7efaf3aec" x="361.97453" y="415.692279" style="fill: #1f77b4; stroke: #1f77b4" />
5005
- <use ns4:href="#md7efaf3aec" x="392.935568" y="416.269451" style="fill: #1f77b4; stroke: #1f77b4" />
5006
- <use ns4:href="#md7efaf3aec" x="423.896605" y="416.168383" style="fill: #1f77b4; stroke: #1f77b4" />
5007
- <use ns4:href="#md7efaf3aec" x="454.857643" y="415.52606" style="fill: #1f77b4; stroke: #1f77b4" />
5008
- <use ns4:href="#md7efaf3aec" x="485.81868" y="415.952048" style="fill: #1f77b4; stroke: #1f77b4" />
5009
- <use ns4:href="#md7efaf3aec" x="516.779718" y="414.072689" style="fill: #1f77b4; stroke: #1f77b4" />
5010
- <use ns4:href="#md7efaf3aec" x="547.740755" y="415.399934" style="fill: #1f77b4; stroke: #1f77b4" />
5011
- <use ns4:href="#md7efaf3aec" x="578.701793" y="415.43418" style="fill: #1f77b4; stroke: #1f77b4" />
5012
- <use ns4:href="#md7efaf3aec" x="609.66283" y="416.402259" style="fill: #1f77b4; stroke: #1f77b4" />
5013
- <use ns4:href="#md7efaf3aec" x="640.623868" y="414.841138" style="fill: #1f77b4; stroke: #1f77b4" />
5014
- <use ns4:href="#md7efaf3aec" x="671.584905" y="415.024898" style="fill: #1f77b4; stroke: #1f77b4" />
5015
- <use ns4:href="#md7efaf3aec" x="702.545943" y="414.990652" style="fill: #1f77b4; stroke: #1f77b4" />
5016
- <use ns4:href="#md7efaf3aec" x="733.50698" y="414.974782" style="fill: #1f77b4; stroke: #1f77b4" />
5017
- <use ns4:href="#md7efaf3aec" x="764.468018" y="415.61794" style="fill: #1f77b4; stroke: #1f77b4" />
5018
- <use ns4:href="#md7efaf3aec" x="795.429055" y="415.759936" style="fill: #1f77b4; stroke: #1f77b4" />
5019
  </g>
5020
  </g>
5021
  <g id="series--torch-eager" class="series">
5022
- <path d="M 83.325193 401.710683 L 114.286231 389.180788 L 145.247268 389.523249 L 176.208306 390.141349 L 207.169343 391.126968 L 238.130381 390.809566 L 269.091418 390.934856 L 300.052455 390.667569 L 331.013493 390.500515 L 361.97453 389.707008 L 392.935568 339.037818 L 423.896605 325.239147 L 454.857643 391.043441 L 485.81868 391.009195 L 516.779718 391.143674 L 547.740755 390.442046 L 578.701793 390.951562 L 609.66283 389.129836 L 640.623868 391.795185 L 671.584905 391.319081 L 702.545943 381.654999 L 733.50698 375.966806 L 764.468018 53.96077 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5023
  <defs>
5024
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
5025
  </defs>
5026
  <g clip-path="url(#pb49fc4c8d2)">
5027
- <use ns4:href="#m9b8c54d372" x="83.325193" y="401.710683" style="fill: #ff7f0e; stroke: #ff7f0e" />
5028
- <use ns4:href="#m9b8c54d372" x="114.286231" y="389.180788" style="fill: #ff7f0e; stroke: #ff7f0e" />
5029
- <use ns4:href="#m9b8c54d372" x="145.247268" y="389.523249" style="fill: #ff7f0e; stroke: #ff7f0e" />
5030
- <use ns4:href="#m9b8c54d372" x="176.208306" y="390.141349" style="fill: #ff7f0e; stroke: #ff7f0e" />
5031
- <use ns4:href="#m9b8c54d372" x="207.169343" y="391.126968" style="fill: #ff7f0e; stroke: #ff7f0e" />
5032
- <use ns4:href="#m9b8c54d372" x="238.130381" y="390.809566" style="fill: #ff7f0e; stroke: #ff7f0e" />
5033
- <use ns4:href="#m9b8c54d372" x="269.091418" y="390.934856" style="fill: #ff7f0e; stroke: #ff7f0e" />
5034
- <use ns4:href="#m9b8c54d372" x="300.052455" y="390.667569" style="fill: #ff7f0e; stroke: #ff7f0e" />
5035
- <use ns4:href="#m9b8c54d372" x="331.013493" y="390.500515" style="fill: #ff7f0e; stroke: #ff7f0e" />
5036
- <use ns4:href="#m9b8c54d372" x="361.97453" y="389.707008" style="fill: #ff7f0e; stroke: #ff7f0e" />
5037
- <use ns4:href="#m9b8c54d372" x="392.935568" y="339.037818" style="fill: #ff7f0e; stroke: #ff7f0e" />
5038
- <use ns4:href="#m9b8c54d372" x="423.896605" y="325.239147" style="fill: #ff7f0e; stroke: #ff7f0e" />
5039
- <use ns4:href="#m9b8c54d372" x="454.857643" y="391.043441" style="fill: #ff7f0e; stroke: #ff7f0e" />
5040
- <use ns4:href="#m9b8c54d372" x="485.81868" y="391.009195" style="fill: #ff7f0e; stroke: #ff7f0e" />
5041
- <use ns4:href="#m9b8c54d372" x="516.779718" y="391.143674" style="fill: #ff7f0e; stroke: #ff7f0e" />
5042
- <use ns4:href="#m9b8c54d372" x="547.740755" y="390.442046" style="fill: #ff7f0e; stroke: #ff7f0e" />
5043
- <use ns4:href="#m9b8c54d372" x="578.701793" y="390.951562" style="fill: #ff7f0e; stroke: #ff7f0e" />
5044
- <use ns4:href="#m9b8c54d372" x="609.66283" y="389.129836" style="fill: #ff7f0e; stroke: #ff7f0e" />
5045
- <use ns4:href="#m9b8c54d372" x="640.623868" y="391.795185" style="fill: #ff7f0e; stroke: #ff7f0e" />
5046
- <use ns4:href="#m9b8c54d372" x="671.584905" y="391.319081" style="fill: #ff7f0e; stroke: #ff7f0e" />
5047
- <use ns4:href="#m9b8c54d372" x="702.545943" y="381.654999" style="fill: #ff7f0e; stroke: #ff7f0e" />
5048
- <use ns4:href="#m9b8c54d372" x="733.50698" y="375.966806" style="fill: #ff7f0e; stroke: #ff7f0e" />
5049
- <use ns4:href="#m9b8c54d372" x="764.468018" y="53.96077" style="fill: #ff7f0e; stroke: #ff7f0e" />
5050
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
5051
  </g>
5052
  </g>
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
 
3880
  <rdf:RDF>
3881
  <ns2:Work>
3882
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3883
+ <dc:date>2025-10-29T15:50:56.264680</dc:date>
3884
  <dc:format>image/svg+xml</dc:format>
3885
  <dc:creator>
3886
  <ns2:Agent>
 
4224
  <g id="matplotlib.axis_2">
4225
  <g id="ytick_1">
4226
  <g id="grid-y--2" class="grid grid-y">
4227
+ <path d="M 47.72 373.1985 L 831.034248 373.1985 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4228
  </g>
4229
  <g id="line2d_25">
4230
  <defs>
4231
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4232
  </defs>
4233
  <g>
4234
+ <use ns4:href="#m0fca2865ba" x="47.72" y="373.1985" style="stroke: #000000; stroke-width: 0.8" />
4235
  </g>
4236
  </g>
4237
  <g id="text_25">
4238
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="376.997718" transform="rotate(-0 40.72 376.997718)">0.1</text>
4239
  </g>
4240
  </g>
4241
  <g id="ytick_2">
4242
  <g id="grid-y--3" class="grid grid-y">
4243
+ <path d="M 47.72 290.703423 L 831.034248 290.703423 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4244
  </g>
4245
  <g id="line2d_26">
4246
  <g>
4247
+ <use ns4:href="#m0fca2865ba" x="47.72" y="290.703423" style="stroke: #000000; stroke-width: 0.8" />
4248
  </g>
4249
  </g>
4250
  <g id="text_26">
4251
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="294.502641" transform="rotate(-0 40.72 294.502641)">0.2</text>
4252
  </g>
4253
  </g>
4254
  <g id="ytick_3">
4255
  <g id="grid-y--4" class="grid grid-y">
4256
+ <path d="M 47.72 208.208345 L 831.034248 208.208345 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4257
  </g>
4258
  <g id="line2d_27">
4259
  <g>
4260
+ <use ns4:href="#m0fca2865ba" x="47.72" y="208.208345" style="stroke: #000000; stroke-width: 0.8" />
4261
  </g>
4262
  </g>
4263
  <g id="text_27">
4264
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="212.007564" transform="rotate(-0 40.72 212.007564)">0.3</text>
4265
  </g>
4266
  </g>
4267
  <g id="ytick_4">
4268
  <g id="grid-y--5" class="grid grid-y">
4269
+ <path d="M 47.72 125.713268 L 831.034248 125.713268 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4270
  </g>
4271
  <g id="line2d_28">
4272
  <g>
4273
+ <use ns4:href="#m0fca2865ba" x="47.72" y="125.713268" style="stroke: #000000; stroke-width: 0.8" />
4274
  </g>
4275
  </g>
4276
  <g id="text_28">
4277
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="129.512487" transform="rotate(-0 40.72 129.512487)">0.4</text>
4278
  </g>
4279
  </g>
4280
  <g id="ytick_5">
4281
  <g id="grid-y--6" class="grid grid-y">
4282
+ <path d="M 47.72 43.218191 L 831.034248 43.218191 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4283
  </g>
4284
  <g id="line2d_29">
4285
  <g>
4286
+ <use ns4:href="#m0fca2865ba" x="47.72" y="43.218191" style="stroke: #000000; stroke-width: 0.8" />
4287
  </g>
4288
  </g>
4289
  <g id="text_29">
4290
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="47.01741" transform="rotate(-0 40.72 47.01741)">0.5</text>
4291
  </g>
4292
  </g>
4293
  <g id="label--y" class="ylabel">
 
4295
  </g>
4296
  </g>
4297
  <g id="series--hf-kernels-causal-conv1d" class="series">
4298
+ <path d="M 83.325193 420.186871 L 114.286231 413.282033 L 145.247268 414.725697 L 176.208306 414.527708 L 207.169343 414.733946 L 238.130381 414.742196 L 269.091418 415.294913 L 300.052455 415.129922 L 331.013493 416.168535 L 361.97453 415.319661 L 392.935568 415.451653 L 423.896605 416.399522 L 454.857643 415.476402 L 485.81868 414.915435 L 516.779718 415.558897 L 547.740755 415.080425 L 578.701793 415.905376 L 609.66283 414.428714 L 640.623868 414.980606 L 671.584905 414.964932 L 702.545943 414.634952 L 733.50698 414.321471 L 764.468018 414.766944 L 795.429055 414.676199 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4299
  <defs>
4300
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4301
  </defs>
4302
  <g clip-path="url(#pb49fc4c8d2)">
4303
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
4304
+ <use ns4:href="#md7efaf3aec" x="114.286231" y="413.282033" style="fill: #1f77b4; stroke: #1f77b4" />
4305
+ <use ns4:href="#md7efaf3aec" x="145.247268" y="414.725697" style="fill: #1f77b4; stroke: #1f77b4" />
4306
+ <use ns4:href="#md7efaf3aec" x="176.208306" y="414.527708" style="fill: #1f77b4; stroke: #1f77b4" />
4307
+ <use ns4:href="#md7efaf3aec" x="207.169343" y="414.733946" style="fill: #1f77b4; stroke: #1f77b4" />
4308
+ <use ns4:href="#md7efaf3aec" x="238.130381" y="414.742196" style="fill: #1f77b4; stroke: #1f77b4" />
4309
+ <use ns4:href="#md7efaf3aec" x="269.091418" y="415.294913" style="fill: #1f77b4; stroke: #1f77b4" />
4310
+ <use ns4:href="#md7efaf3aec" x="300.052455" y="415.129922" style="fill: #1f77b4; stroke: #1f77b4" />
4311
+ <use ns4:href="#md7efaf3aec" x="331.013493" y="416.168535" style="fill: #1f77b4; stroke: #1f77b4" />
4312
+ <use ns4:href="#md7efaf3aec" x="361.97453" y="415.319661" style="fill: #1f77b4; stroke: #1f77b4" />
4313
+ <use ns4:href="#md7efaf3aec" x="392.935568" y="415.451653" style="fill: #1f77b4; stroke: #1f77b4" />
4314
+ <use ns4:href="#md7efaf3aec" x="423.896605" y="416.399522" style="fill: #1f77b4; stroke: #1f77b4" />
4315
+ <use ns4:href="#md7efaf3aec" x="454.857643" y="415.476402" style="fill: #1f77b4; stroke: #1f77b4" />
4316
+ <use ns4:href="#md7efaf3aec" x="485.81868" y="414.915435" style="fill: #1f77b4; stroke: #1f77b4" />
4317
+ <use ns4:href="#md7efaf3aec" x="516.779718" y="415.558897" style="fill: #1f77b4; stroke: #1f77b4" />
4318
+ <use ns4:href="#md7efaf3aec" x="547.740755" y="415.080425" style="fill: #1f77b4; stroke: #1f77b4" />
4319
+ <use ns4:href="#md7efaf3aec" x="578.701793" y="415.905376" style="fill: #1f77b4; stroke: #1f77b4" />
4320
+ <use ns4:href="#md7efaf3aec" x="609.66283" y="414.428714" style="fill: #1f77b4; stroke: #1f77b4" />
4321
+ <use ns4:href="#md7efaf3aec" x="640.623868" y="414.980606" style="fill: #1f77b4; stroke: #1f77b4" />
4322
+ <use ns4:href="#md7efaf3aec" x="671.584905" y="414.964932" style="fill: #1f77b4; stroke: #1f77b4" />
4323
+ <use ns4:href="#md7efaf3aec" x="702.545943" y="414.634952" style="fill: #1f77b4; stroke: #1f77b4" />
4324
+ <use ns4:href="#md7efaf3aec" x="733.50698" y="414.321471" style="fill: #1f77b4; stroke: #1f77b4" />
4325
+ <use ns4:href="#md7efaf3aec" x="764.468018" y="414.766944" style="fill: #1f77b4; stroke: #1f77b4" />
4326
+ <use ns4:href="#md7efaf3aec" x="795.429055" y="414.676199" style="fill: #1f77b4; stroke: #1f77b4" />
4327
  </g>
4328
  </g>
4329
  <g id="series--torch-eager" class="series">
4330
+ <path d="M 83.325193 397.16167 L 114.286231 384.655416 L 145.247268 385.439119 L 176.208306 386.603125 L 207.169343 386.93228 L 238.130381 386.948779 L 269.091418 386.320992 L 300.052455 386.924031 L 331.013493 386.998276 L 361.97453 386.93228 L 392.935568 335.710262 L 423.896605 321.165555 L 454.857643 387.52707 L 485.81868 387.60874 L 516.779718 388.243952 L 547.740755 387.27051 L 578.701793 386.635298 L 609.66283 387.072522 L 640.623868 387.048598 L 671.584905 386.643547 L 702.545943 378.377541 L 733.50698 373.106105 L 764.468018 54.3872 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4331
  <defs>
4332
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4333
  </defs>
4334
  <g clip-path="url(#pb49fc4c8d2)">
4335
+ <use ns4:href="#m9b8c54d372" x="83.325193" y="397.16167" style="fill: #ff7f0e; stroke: #ff7f0e" />
4336
+ <use ns4:href="#m9b8c54d372" x="114.286231" y="384.655416" style="fill: #ff7f0e; stroke: #ff7f0e" />
4337
+ <use ns4:href="#m9b8c54d372" x="145.247268" y="385.439119" style="fill: #ff7f0e; stroke: #ff7f0e" />
4338
+ <use ns4:href="#m9b8c54d372" x="176.208306" y="386.603125" style="fill: #ff7f0e; stroke: #ff7f0e" />
4339
+ <use ns4:href="#m9b8c54d372" x="207.169343" y="386.93228" style="fill: #ff7f0e; stroke: #ff7f0e" />
4340
+ <use ns4:href="#m9b8c54d372" x="238.130381" y="386.948779" style="fill: #ff7f0e; stroke: #ff7f0e" />
4341
+ <use ns4:href="#m9b8c54d372" x="269.091418" y="386.320992" style="fill: #ff7f0e; stroke: #ff7f0e" />
4342
+ <use ns4:href="#m9b8c54d372" x="300.052455" y="386.924031" style="fill: #ff7f0e; stroke: #ff7f0e" />
4343
+ <use ns4:href="#m9b8c54d372" x="331.013493" y="386.998276" style="fill: #ff7f0e; stroke: #ff7f0e" />
4344
+ <use ns4:href="#m9b8c54d372" x="361.97453" y="386.93228" style="fill: #ff7f0e; stroke: #ff7f0e" />
4345
+ <use ns4:href="#m9b8c54d372" x="392.935568" y="335.710262" style="fill: #ff7f0e; stroke: #ff7f0e" />
4346
+ <use ns4:href="#m9b8c54d372" x="423.896605" y="321.165555" style="fill: #ff7f0e; stroke: #ff7f0e" />
4347
+ <use ns4:href="#m9b8c54d372" x="454.857643" y="387.52707" style="fill: #ff7f0e; stroke: #ff7f0e" />
4348
+ <use ns4:href="#m9b8c54d372" x="485.81868" y="387.60874" style="fill: #ff7f0e; stroke: #ff7f0e" />
4349
+ <use ns4:href="#m9b8c54d372" x="516.779718" y="388.243952" style="fill: #ff7f0e; stroke: #ff7f0e" />
4350
+ <use ns4:href="#m9b8c54d372" x="547.740755" y="387.27051" style="fill: #ff7f0e; stroke: #ff7f0e" />
4351
+ <use ns4:href="#m9b8c54d372" x="578.701793" y="386.635298" style="fill: #ff7f0e; stroke: #ff7f0e" />
4352
+ <use ns4:href="#m9b8c54d372" x="609.66283" y="387.072522" style="fill: #ff7f0e; stroke: #ff7f0e" />
4353
+ <use ns4:href="#m9b8c54d372" x="640.623868" y="387.048598" style="fill: #ff7f0e; stroke: #ff7f0e" />
4354
+ <use ns4:href="#m9b8c54d372" x="671.584905" y="386.643547" style="fill: #ff7f0e; stroke: #ff7f0e" />
4355
+ <use ns4:href="#m9b8c54d372" x="702.545943" y="378.377541" style="fill: #ff7f0e; stroke: #ff7f0e" />
4356
+ <use ns4:href="#m9b8c54d372" x="733.50698" y="373.106105" style="fill: #ff7f0e; stroke: #ff7f0e" />
4357
+ <use ns4:href="#m9b8c54d372" x="764.468018" y="54.3872" style="fill: #ff7f0e; stroke: #ff7f0e" />
4358
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
4359
  </g>
4360
  </g>
 
4413
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4414
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4415
  </span> |
4416
+ Cell: combine | 4.35s
4417
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4418
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4419
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4506
  hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True
4507
  hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True
4508
  hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True
4509
+ hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.04 True
4510
+ hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.05 True
4511
+ hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.05 True
4512
  hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True
4513
+ hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.05 True
4514
+ hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.05 True
4515
  hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2 0.05 True
4516
  hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4 0.05 True
4517
  hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2 0.05 True
 
4522
  hf_kernels_causal_conv1d cuda_B4_D64_S128_W4 0.05 True
4523
  hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2 0.05 True
4524
  hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4 0.05 True
4525
+ hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.05 True
4526
  hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
4527
  torch_eager cuda_B2_D2048_S128_W2 0.08 True
4528
  torch_eager cuda_B2_D2048_S128_W4 0.08 True
 
4545
  torch_eager cuda_B4_D64_S128_W2 0.08 True
4546
  torch_eager cuda_B4_D64_S128_W4 0.08 True
4547
  torch_eager cuda_B4_D64_S2048_W2 0.08 True
4548
+ torch_eager cuda_B4_D64_S2048_W4 0.08 True
4549
  torch_eager cuda_B4_D64_S512_W2 0.08 True
4550
  torch_eager cuda_B4_D64_S512_W4 0.08 True
4551
 
 
4567
  <div class="uv-install-logs" id="uv-logs-combine">
4568
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4569
  <div class="uv-logs-content" style="display: none;">
4570
+ Installed 37 packages in 191ms
4571
  </div>
4572
  </div>
4573
  <div class="cell-artifacts">
 
4580
  <rdf:RDF>
4581
  <ns2:Work>
4582
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4583
+ <dc:date>2025-10-29T15:50:56.264680</dc:date>
4584
  <dc:format>image/svg+xml</dc:format>
4585
  <dc:creator>
4586
  <ns2:Agent>
 
4924
  <g id="matplotlib.axis_2">
4925
  <g id="ytick_1">
4926
  <g id="grid-y--2" class="grid grid-y">
4927
+ <path d="M 47.72 373.1985 L 831.034248 373.1985 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4928
  </g>
4929
  <g id="line2d_25">
4930
  <defs>
4931
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4932
  </defs>
4933
  <g>
4934
+ <use ns4:href="#m0fca2865ba" x="47.72" y="373.1985" style="stroke: #000000; stroke-width: 0.8" />
4935
  </g>
4936
  </g>
4937
  <g id="text_25">
4938
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="376.997718" transform="rotate(-0 40.72 376.997718)">0.1</text>
4939
  </g>
4940
  </g>
4941
  <g id="ytick_2">
4942
  <g id="grid-y--3" class="grid grid-y">
4943
+ <path d="M 47.72 290.703423 L 831.034248 290.703423 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4944
  </g>
4945
  <g id="line2d_26">
4946
  <g>
4947
+ <use ns4:href="#m0fca2865ba" x="47.72" y="290.703423" style="stroke: #000000; stroke-width: 0.8" />
4948
  </g>
4949
  </g>
4950
  <g id="text_26">
4951
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="294.502641" transform="rotate(-0 40.72 294.502641)">0.2</text>
4952
  </g>
4953
  </g>
4954
  <g id="ytick_3">
4955
  <g id="grid-y--4" class="grid grid-y">
4956
+ <path d="M 47.72 208.208345 L 831.034248 208.208345 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4957
  </g>
4958
  <g id="line2d_27">
4959
  <g>
4960
+ <use ns4:href="#m0fca2865ba" x="47.72" y="208.208345" style="stroke: #000000; stroke-width: 0.8" />
4961
  </g>
4962
  </g>
4963
  <g id="text_27">
4964
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="212.007564" transform="rotate(-0 40.72 212.007564)">0.3</text>
4965
  </g>
4966
  </g>
4967
  <g id="ytick_4">
4968
  <g id="grid-y--5" class="grid grid-y">
4969
+ <path d="M 47.72 125.713268 L 831.034248 125.713268 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4970
  </g>
4971
  <g id="line2d_28">
4972
  <g>
4973
+ <use ns4:href="#m0fca2865ba" x="47.72" y="125.713268" style="stroke: #000000; stroke-width: 0.8" />
4974
  </g>
4975
  </g>
4976
  <g id="text_28">
4977
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="129.512487" transform="rotate(-0 40.72 129.512487)">0.4</text>
4978
  </g>
4979
  </g>
4980
  <g id="ytick_5">
4981
  <g id="grid-y--6" class="grid grid-y">
4982
+ <path d="M 47.72 43.218191 L 831.034248 43.218191 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4983
  </g>
4984
  <g id="line2d_29">
4985
  <g>
4986
+ <use ns4:href="#m0fca2865ba" x="47.72" y="43.218191" style="stroke: #000000; stroke-width: 0.8" />
4987
  </g>
4988
  </g>
4989
  <g id="text_29">
4990
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="47.01741" transform="rotate(-0 40.72 47.01741)">0.5</text>
4991
  </g>
4992
  </g>
4993
  <g id="label--y" class="ylabel">
 
4995
  </g>
4996
  </g>
4997
  <g id="series--hf-kernels-causal-conv1d" class="series">
4998
+ <path d="M 83.325193 420.186871 L 114.286231 413.282033 L 145.247268 414.725697 L 176.208306 414.527708 L 207.169343 414.733946 L 238.130381 414.742196 L 269.091418 415.294913 L 300.052455 415.129922 L 331.013493 416.168535 L 361.97453 415.319661 L 392.935568 415.451653 L 423.896605 416.399522 L 454.857643 415.476402 L 485.81868 414.915435 L 516.779718 415.558897 L 547.740755 415.080425 L 578.701793 415.905376 L 609.66283 414.428714 L 640.623868 414.980606 L 671.584905 414.964932 L 702.545943 414.634952 L 733.50698 414.321471 L 764.468018 414.766944 L 795.429055 414.676199 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4999
  <defs>
5000
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
5001
  </defs>
5002
  <g clip-path="url(#pb49fc4c8d2)">
5003
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
5004
+ <use ns4:href="#md7efaf3aec" x="114.286231" y="413.282033" style="fill: #1f77b4; stroke: #1f77b4" />
5005
+ <use ns4:href="#md7efaf3aec" x="145.247268" y="414.725697" style="fill: #1f77b4; stroke: #1f77b4" />
5006
+ <use ns4:href="#md7efaf3aec" x="176.208306" y="414.527708" style="fill: #1f77b4; stroke: #1f77b4" />
5007
+ <use ns4:href="#md7efaf3aec" x="207.169343" y="414.733946" style="fill: #1f77b4; stroke: #1f77b4" />
5008
+ <use ns4:href="#md7efaf3aec" x="238.130381" y="414.742196" style="fill: #1f77b4; stroke: #1f77b4" />
5009
+ <use ns4:href="#md7efaf3aec" x="269.091418" y="415.294913" style="fill: #1f77b4; stroke: #1f77b4" />
5010
+ <use ns4:href="#md7efaf3aec" x="300.052455" y="415.129922" style="fill: #1f77b4; stroke: #1f77b4" />
5011
+ <use ns4:href="#md7efaf3aec" x="331.013493" y="416.168535" style="fill: #1f77b4; stroke: #1f77b4" />
5012
+ <use ns4:href="#md7efaf3aec" x="361.97453" y="415.319661" style="fill: #1f77b4; stroke: #1f77b4" />
5013
+ <use ns4:href="#md7efaf3aec" x="392.935568" y="415.451653" style="fill: #1f77b4; stroke: #1f77b4" />
5014
+ <use ns4:href="#md7efaf3aec" x="423.896605" y="416.399522" style="fill: #1f77b4; stroke: #1f77b4" />
5015
+ <use ns4:href="#md7efaf3aec" x="454.857643" y="415.476402" style="fill: #1f77b4; stroke: #1f77b4" />
5016
+ <use ns4:href="#md7efaf3aec" x="485.81868" y="414.915435" style="fill: #1f77b4; stroke: #1f77b4" />
5017
+ <use ns4:href="#md7efaf3aec" x="516.779718" y="415.558897" style="fill: #1f77b4; stroke: #1f77b4" />
5018
+ <use ns4:href="#md7efaf3aec" x="547.740755" y="415.080425" style="fill: #1f77b4; stroke: #1f77b4" />
5019
+ <use ns4:href="#md7efaf3aec" x="578.701793" y="415.905376" style="fill: #1f77b4; stroke: #1f77b4" />
5020
+ <use ns4:href="#md7efaf3aec" x="609.66283" y="414.428714" style="fill: #1f77b4; stroke: #1f77b4" />
5021
+ <use ns4:href="#md7efaf3aec" x="640.623868" y="414.980606" style="fill: #1f77b4; stroke: #1f77b4" />
5022
+ <use ns4:href="#md7efaf3aec" x="671.584905" y="414.964932" style="fill: #1f77b4; stroke: #1f77b4" />
5023
+ <use ns4:href="#md7efaf3aec" x="702.545943" y="414.634952" style="fill: #1f77b4; stroke: #1f77b4" />
5024
+ <use ns4:href="#md7efaf3aec" x="733.50698" y="414.321471" style="fill: #1f77b4; stroke: #1f77b4" />
5025
+ <use ns4:href="#md7efaf3aec" x="764.468018" y="414.766944" style="fill: #1f77b4; stroke: #1f77b4" />
5026
+ <use ns4:href="#md7efaf3aec" x="795.429055" y="414.676199" style="fill: #1f77b4; stroke: #1f77b4" />
5027
  </g>
5028
  </g>
5029
  <g id="series--torch-eager" class="series">
5030
+ <path d="M 83.325193 397.16167 L 114.286231 384.655416 L 145.247268 385.439119 L 176.208306 386.603125 L 207.169343 386.93228 L 238.130381 386.948779 L 269.091418 386.320992 L 300.052455 386.924031 L 331.013493 386.998276 L 361.97453 386.93228 L 392.935568 335.710262 L 423.896605 321.165555 L 454.857643 387.52707 L 485.81868 387.60874 L 516.779718 388.243952 L 547.740755 387.27051 L 578.701793 386.635298 L 609.66283 387.072522 L 640.623868 387.048598 L 671.584905 386.643547 L 702.545943 378.377541 L 733.50698 373.106105 L 764.468018 54.3872 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5031
  <defs>
5032
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
5033
  </defs>
5034
  <g clip-path="url(#pb49fc4c8d2)">
5035
+ <use ns4:href="#m9b8c54d372" x="83.325193" y="397.16167" style="fill: #ff7f0e; stroke: #ff7f0e" />
5036
+ <use ns4:href="#m9b8c54d372" x="114.286231" y="384.655416" style="fill: #ff7f0e; stroke: #ff7f0e" />
5037
+ <use ns4:href="#m9b8c54d372" x="145.247268" y="385.439119" style="fill: #ff7f0e; stroke: #ff7f0e" />
5038
+ <use ns4:href="#m9b8c54d372" x="176.208306" y="386.603125" style="fill: #ff7f0e; stroke: #ff7f0e" />
5039
+ <use ns4:href="#m9b8c54d372" x="207.169343" y="386.93228" style="fill: #ff7f0e; stroke: #ff7f0e" />
5040
+ <use ns4:href="#m9b8c54d372" x="238.130381" y="386.948779" style="fill: #ff7f0e; stroke: #ff7f0e" />
5041
+ <use ns4:href="#m9b8c54d372" x="269.091418" y="386.320992" style="fill: #ff7f0e; stroke: #ff7f0e" />
5042
+ <use ns4:href="#m9b8c54d372" x="300.052455" y="386.924031" style="fill: #ff7f0e; stroke: #ff7f0e" />
5043
+ <use ns4:href="#m9b8c54d372" x="331.013493" y="386.998276" style="fill: #ff7f0e; stroke: #ff7f0e" />
5044
+ <use ns4:href="#m9b8c54d372" x="361.97453" y="386.93228" style="fill: #ff7f0e; stroke: #ff7f0e" />
5045
+ <use ns4:href="#m9b8c54d372" x="392.935568" y="335.710262" style="fill: #ff7f0e; stroke: #ff7f0e" />
5046
+ <use ns4:href="#m9b8c54d372" x="423.896605" y="321.165555" style="fill: #ff7f0e; stroke: #ff7f0e" />
5047
+ <use ns4:href="#m9b8c54d372" x="454.857643" y="387.52707" style="fill: #ff7f0e; stroke: #ff7f0e" />
5048
+ <use ns4:href="#m9b8c54d372" x="485.81868" y="387.60874" style="fill: #ff7f0e; stroke: #ff7f0e" />
5049
+ <use ns4:href="#m9b8c54d372" x="516.779718" y="388.243952" style="fill: #ff7f0e; stroke: #ff7f0e" />
5050
+ <use ns4:href="#m9b8c54d372" x="547.740755" y="387.27051" style="fill: #ff7f0e; stroke: #ff7f0e" />
5051
+ <use ns4:href="#m9b8c54d372" x="578.701793" y="386.635298" style="fill: #ff7f0e; stroke: #ff7f0e" />
5052
+ <use ns4:href="#m9b8c54d372" x="609.66283" y="387.072522" style="fill: #ff7f0e; stroke: #ff7f0e" />
5053
+ <use ns4:href="#m9b8c54d372" x="640.623868" y="387.048598" style="fill: #ff7f0e; stroke: #ff7f0e" />
5054
+ <use ns4:href="#m9b8c54d372" x="671.584905" y="386.643547" style="fill: #ff7f0e; stroke: #ff7f0e" />
5055
+ <use ns4:href="#m9b8c54d372" x="702.545943" y="378.377541" style="fill: #ff7f0e; stroke: #ff7f0e" />
5056
+ <use ns4:href="#m9b8c54d372" x="733.50698" y="373.106105" style="fill: #ff7f0e; stroke: #ff7f0e" />
5057
+ <use ns4:href="#m9b8c54d372" x="764.468018" y="54.3872" style="fill: #ff7f0e; stroke: #ff7f0e" />
5058
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
5059
  </g>
5060
  </g>
flash_attn/impls/artifacts/benchmark/attention.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-29T14:27:40Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.98791400000664, "p50": 0.995113999977093, "p90": 1.0003840000081254, "mean": 0.9967803999984426, "iqr": 0.00634899998885885, "raw_times": [0.98791400000664, 0.9940350000192666, 1.006454999981088, 1.0003840000081254, 0.995113999977093], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0150049999992916, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0233649999804584, "p50": 1.0321449999537435, "p90": 1.0348449999924014, "mean": 1.032277399974646, "iqr": 0.009739000006447895, "raw_times": [1.045925999960673, 1.0251059999859535, 1.0321449999537435, 1.0233649999804584, 1.0348449999924014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.035865999995167, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0737370000128976, "p50": 1.084086999981082, "p90": 1.088675999994848, "mean": 1.0826705999988917, "iqr": 0.0148400000057336, "raw_times": [1.0738359999891145, 1.0930170000165162, 1.084086999981082, 1.0737370000128976, 1.088675999994848], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0889670000437945, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0887770000067576, "p50": 1.0916359999555425, "p90": 1.096396999969329, "mean": 1.0932085999797891, "iqr": 0.005600999998023326, "raw_times": [1.0887770000067576, 1.0907959999713057, 1.0916359999555425, 1.0984369999960109, 1.096396999969329], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1031370000296192, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2465009999687027, "p50": 1.2523310000460697, "p90": 1.2523909999799798, "mean": 1.2538410000047406, "iqr": 0.005290999979479238, "raw_times": [1.2523310000460697, 1.2523909999799798, 1.2471000000005006, 1.2465009999687027, 1.2708820000284504], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2551809999763464, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2355700000057368, "p50": 1.241141000036805, "p90": 1.2576200000466997, "mean": 1.2477664000130062, "iqr": 0.02047000003813082, "raw_times": [1.2355700000057368, 1.241141000036805, 1.237150000008569, 1.2576200000466997, 1.2673509999672206], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2579809999806457, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-29T15:50:47Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.8233430000123008, "p50": 1.8343830000162598, "p90": 1.8450139999686144, "mean": 1.8363673999942876, "iqr": 0.021300000014434772, "raw_times": [1.8450139999686144, 1.8233430000123008, 1.8237139999541796, 1.8343830000162598, 1.8553830000200833], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.8232439999792405, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-29T15:50:47Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.8942840000022443, "p50": 1.9424449999974058, "p90": 1.9434060000094178, "mean": 1.9367254000030698, "iqr": 0.0023400000372930663, "raw_times": [1.8942840000022443, 1.9424449999974058, 1.9410659999721247, 1.9434060000094178, 1.9624260000341565], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.9008649999818772, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-29T15:50:48Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.942595999992136, "p50": 1.9503360000499015, "p90": 2.019877999998698, "mean": 1.9758666000029734, "iqr": 0.0764520000302582, "raw_times": [1.9503360000499015, 1.94342599996844, 1.942595999992136, 2.019877999998698, 2.0230970000056914], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.9501660000287302, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-29T15:50:48Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.9654459999856044, "p50": 2.0491880000008678, "p90": 2.050657999973282, "mean": 2.0347600000036437, "iqr": 0.0033989999224104395, "raw_times": [1.9654459999856044, 2.0491880000008678, 2.0472590000508717, 2.050657999973282, 2.0612490000075923], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.0352980000097887, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-29T15:50:48Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 2.0188670000038655, "p50": 2.067507999981899, "p90": 2.1027900000376576, "mean": 2.0633722000184207, "iqr": 0.07837300000801406, "raw_times": [2.0188670000038655, 2.0244170000296435, 2.067507999981899, 2.1027900000376576, 2.103279000039038], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.0235979999938536, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-10-29T15:50:48Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 2.1849919999681333, "p50": 2.1887119999632887, "p90": 2.2487329999876238, "mean": 2.212510399988332, "iqr": 0.06324099996390942, "raw_times": [2.1849919999681333, 2.1887119999632887, 2.1854920000237144, 2.2487329999876238, 2.254622999998901], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.1668410000188487, "peak_bytes": 319946752, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/cells/benchmark.py CHANGED
@@ -4,7 +4,6 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
- # "xformers",
8
  # ]
9
  #
10
  # [tool.uv.sources]
@@ -13,18 +12,20 @@
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
- import xformers.ops as xops
17
 
18
 
19
- def xformers_attention(q, k, v):
20
- """xFormers memory efficient attention"""
21
- # xFormers expects [batch, seq_len, heads, head_dim]
22
- return xops.memory_efficient_attention(q, k, v)
 
 
 
23
 
24
 
25
  run_benchmark(
26
  kernel_type=KernelTypeEnum.ATTENTION,
27
- impl_name="xformers_meff",
28
- impl_tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
29
- impl_func=xformers_attention,
30
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
 
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 
15
 
16
 
17
+ def torch_mem_eff(q, k, v):
18
+ qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
19
+ with torch.nn.attention.sdpa_kernel(
20
+ torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION
21
+ ):
22
+ o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
23
+ return o.transpose(1, 2).contiguous()
24
 
25
 
26
  run_benchmark(
27
  kernel_type=KernelTypeEnum.ATTENTION,
28
+ impl_name="torch_mem_eff",
29
+ impl_tags={"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"},
30
+ impl_func=torch_mem_eff,
31
  )
flash_attn/impls/flash_attention.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
@@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: nv | 0.28s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3888,7 +3896,7 @@ Cell: nv | 0.28s
3888
  </div>
3889
  </div>
3890
  <div id="output-nv" class="cell-output">
3891
- <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:25:53 2025
3892
  +-----------------------------------------------------------------------------------------+
3893
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3894
  |-----------------------------------------+------------------------+----------------------+
@@ -3897,7 +3905,7 @@ Cell: nv | 0.28s
3897
  | | | MIG M. |
3898
  |=========================================+========================+======================|
3899
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3900
- | N/A 27C P8 21W / 350W | 0MiB / 46068MiB | 0% Default |
3901
  | | | N/A |
3902
  +-----------------------------------------+------------------------+----------------------+
3903
 
@@ -3919,9 +3927,9 @@ Cell: nv | 0.28s
3919
  <span class="collapse-indicators">
3920
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3921
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3922
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3923
  </span> |
3924
- Cell: benchmark | 32.77s
3925
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3926
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3927
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3972,29 +3980,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.644ms 102.02% 3.644ms 3.644ms 1
3976
- torch_flash_ma 6.80% 356.846us 47.04% 2.468ms 2.468ms 0.000us 0.00% 3.612ms 3.612ms 1
3977
- aten::scaled_dot_product_attention 0.82% 43.042us 4.47% 234.776us 78.259us 0.000us 0.00% 2.857ms 952.201us 3
3978
- aten::_scaled_dot_product_flash_attention 0.56% 29.330us 3.65% 191.734us 63.911us 0.000us 0.00% 2.857ms 952.201us 3
3979
- aten::_flash_attention_forward 0.75% 39.581us 2.59% 135.674us 45.225us 2.857ms 79.97% 2.857ms 952.201us 3
3980
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.857ms 79.97% 2.857ms 952.201us 3
3981
- aten::contiguous 0.27% 14.180us 34.32% 1.801ms 150.051us 0.000us 0.00% 755.680us 62.973us 12
3982
- aten::clone 0.74% 38.791us 34.04% 1.786ms 148.870us 0.000us 0.00% 755.680us 62.973us 12
3983
- aten::copy_ 1.85% 97.030us 31.43% 1.649ms 137.429us 715.456us 20.03% 755.680us 62.973us 12
3984
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 715.456us 20.03% 715.456us 59.621us 12
3985
- Activity Buffer Request 27.38% 1.437ms 27.38% 1.437ms 1.437ms 40.224us 1.13% 40.224us 40.224us 1
3986
- aten::transpose 1.47% 77.273us 1.96% 102.714us 4.280us 0.000us 0.00% 0.000us 0.000us 24
3987
- aten::as_strided 0.48% 25.441us 0.48% 25.441us 1.060us 0.000us 0.00% 0.000us 0.000us 24
3988
- aten::empty_like 0.70% 36.821us 2.35% 123.326us 8.222us 0.000us 0.00% 0.000us 0.000us 15
3989
- aten::empty 1.93% 101.493us 1.93% 101.493us 4.229us 0.000us 0.00% 0.000us 0.000us 24
3990
- cudaLaunchKernel 2.70% 141.775us 2.70% 141.775us 9.452us 0.000us 0.00% 0.000us 0.000us 15
3991
- aten::empty_strided 0.35% 18.402us 0.35% 18.402us 6.134us 0.000us 0.00% 0.000us 0.000us 3
3992
- cudaDeviceGetAttribute 0.05% 2.540us 0.05% 2.540us 0.423us 0.000us 0.00% 0.000us 0.000us 6
3993
- cudaFuncSetAttribute 0.17% 8.890us 0.17% 8.890us 2.963us 0.000us 0.00% 0.000us 0.000us 3
3994
- cudaDeviceSynchronize 52.96% 2.779ms 52.96% 2.779ms 2.779ms 0.000us 0.00% 0.000us 0.000us 1
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
- Self CPU time total: 5.247ms
3997
- Self CUDA time total: 3.572ms
3998
 
3999
 
4000
 
@@ -4004,29 +4012,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
4004
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
- torch_flash_ma 4.70% 246.528us 41.73% 2.189ms 2.189ms 0.000us 0.00% 3.817ms 3.817ms 1
4008
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.772ms 100.28% 3.772ms 3.772ms 1
4009
- aten::scaled_dot_product_attention 0.51% 26.610us 3.43% 180.143us 60.048us 0.000us 0.00% 2.999ms 999.573us 3
4010
- aten::_scaled_dot_product_flash_attention 0.37% 19.600us 2.93% 153.533us 51.178us 0.000us 0.00% 2.999ms 999.573us 3
4011
- aten::_flash_attention_forward 0.63% 32.980us 2.12% 111.443us 37.148us 2.999ms 79.71% 2.999ms 999.573us 3
4012
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.999ms 79.71% 2.999ms 999.573us 3
4013
- aten::contiguous 0.19% 10.030us 32.68% 1.715ms 142.893us 0.000us 0.00% 818.210us 68.184us 12
4014
- aten::clone 0.55% 29.002us 32.49% 1.705ms 142.057us 0.000us 0.00% 818.210us 68.184us 12
4015
- aten::copy_ 2.09% 109.441us 30.74% 1.613ms 134.399us 763.297us 20.29% 818.210us 68.184us 12
4016
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 763.297us 20.29% 763.297us 63.608us 12
4017
- Activity Buffer Request 26.94% 1.413ms 26.94% 1.413ms 1.413ms 54.913us 1.46% 54.913us 54.913us 1
4018
- aten::transpose 1.00% 52.652us 1.34% 70.433us 2.935us 0.000us 0.00% 0.000us 0.000us 24
4019
- aten::as_strided 0.34% 17.781us 0.34% 17.781us 0.741us 0.000us 0.00% 0.000us 0.000us 24
4020
- aten::empty_like 0.38% 19.980us 1.61% 84.581us 5.639us 0.000us 0.00% 0.000us 0.000us 15
4021
- aten::empty 1.45% 76.201us 1.45% 76.201us 3.175us 0.000us 0.00% 0.000us 0.000us 24
4022
- cudaLaunchKernel 2.16% 113.102us 2.16% 113.102us 7.540us 0.000us 0.00% 0.000us 0.000us 15
4023
- aten::empty_strided 0.31% 16.430us 0.31% 16.430us 5.477us 0.000us 0.00% 0.000us 0.000us 3
4024
- cudaDeviceGetAttribute 0.03% 1.751us 0.03% 1.751us 0.292us 0.000us 0.00% 0.000us 0.000us 6
4025
- cudaFuncSetAttribute 0.07% 3.771us 0.07% 3.771us 1.257us 0.000us 0.00% 0.000us 0.000us 3
4026
- cudaDeviceSynchronize 58.27% 3.058ms 58.27% 3.058ms 3.058ms 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
- Self CPU time total: 5.247ms
4029
- Self CUDA time total: 3.762ms
4030
 
4031
 
4032
 
@@ -4036,29 +4044,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
- torch_flash_ma 4.50% 237.986us 41.18% 2.178ms 2.178ms 0.000us 0.00% 3.833ms 3.833ms 1
4040
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.785ms 100.29% 3.785ms 3.785ms 1
4041
- aten::scaled_dot_product_attention 0.46% 24.381us 3.40% 179.915us 59.972us 0.000us 0.00% 2.998ms 999.221us 3
4042
- aten::_scaled_dot_product_flash_attention 0.36% 19.171us 2.94% 155.534us 51.845us 0.000us 0.00% 2.998ms 999.221us 3
4043
- aten::_flash_attention_forward 0.65% 34.259us 2.15% 113.691us 37.897us 2.998ms 79.44% 2.998ms 999.221us 3
4044
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.998ms 79.44% 2.998ms 999.221us 3
4045
- aten::contiguous 0.19% 9.800us 32.38% 1.712ms 142.708us 0.000us 0.00% 835.263us 69.605us 12
4046
- aten::clone 0.53% 28.211us 32.20% 1.703ms 141.891us 0.000us 0.00% 835.263us 69.605us 12
4047
- aten::copy_ 1.60% 84.650us 30.46% 1.611ms 134.247us 776.063us 20.56% 835.263us 69.605us 12
4048
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 776.063us 20.56% 776.063us 64.672us 12
4049
- Activity Buffer Request 27.18% 1.437ms 27.18% 1.437ms 1.437ms 59.200us 1.57% 59.200us 59.200us 1
4050
- aten::transpose 0.99% 52.225us 1.33% 70.125us 2.922us 0.000us 0.00% 0.000us 0.000us 24
4051
- aten::as_strided 0.34% 17.900us 0.34% 17.900us 0.746us 0.000us 0.00% 0.000us 0.000us 24
4052
- aten::empty_like 0.37% 19.782us 1.60% 84.803us 5.654us 0.000us 0.00% 0.000us 0.000us 15
4053
- aten::empty 1.45% 76.431us 1.45% 76.431us 3.185us 0.000us 0.00% 0.000us 0.000us 24
4054
- cudaLaunchKernel 2.16% 114.204us 2.16% 114.204us 7.614us 0.000us 0.00% 0.000us 0.000us 15
4055
- aten::empty_strided 0.30% 16.100us 0.30% 16.100us 5.367us 0.000us 0.00% 0.000us 0.000us 3
4056
- cudaDeviceGetAttribute 0.03% 1.730us 0.03% 1.730us 0.288us 0.000us 0.00% 0.000us 0.000us 6
4057
- cudaFuncSetAttribute 0.07% 3.730us 0.07% 3.730us 1.243us 0.000us 0.00% 0.000us 0.000us 3
4058
- cudaDeviceSynchronize 58.82% 3.110ms 58.82% 3.110ms 3.110ms 0.000us 0.00% 0.000us 0.000us 1
4059
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
- Self CPU time total: 5.288ms
4061
- Self CUDA time total: 3.774ms
4062
 
4063
 
4064
 
@@ -4068,29 +4076,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
4068
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4069
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
- torch_flash_ma 4.36% 241.837us 43.33% 2.405ms 2.405ms 0.000us 0.00% 3.884ms 3.884ms 1
4072
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.837ms 100.27% 3.837ms 3.837ms 1
4073
- aten::scaled_dot_product_attention 0.48% 26.802us 3.27% 181.715us 60.572us 0.000us 0.00% 3.042ms 1.014ms 3
4074
- aten::_scaled_dot_product_flash_attention 0.35% 19.308us 2.79% 154.913us 51.638us 0.000us 0.00% 3.042ms 1.014ms 3
4075
- aten::_flash_attention_forward 0.60% 33.361us 2.03% 112.712us 37.571us 3.042ms 79.50% 3.042ms 1.014ms 3
4076
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.042ms 79.50% 3.042ms 1.014ms 3
4077
- aten::contiguous 0.17% 9.659us 34.84% 1.934ms 161.162us 0.000us 0.00% 841.829us 70.152us 12
4078
- aten::clone 0.50% 27.830us 34.67% 1.924ms 160.357us 0.000us 0.00% 841.829us 70.152us 12
4079
- aten::copy_ 1.56% 86.702us 32.55% 1.807ms 150.547us 784.548us 20.50% 841.829us 70.152us 12
4080
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 784.548us 20.50% 784.548us 65.379us 12
4081
- Activity Buffer Request 25.45% 1.413ms 25.45% 1.413ms 1.413ms 57.281us 1.50% 57.281us 57.281us 1
4082
- aten::transpose 0.95% 52.620us 1.27% 70.404us 2.933us 0.000us 0.00% 0.000us 0.000us 24
4083
- aten::as_strided 0.32% 17.784us 0.32% 17.784us 0.741us 0.000us 0.00% 0.000us 0.000us 24
4084
- aten::empty_like 0.78% 43.221us 2.00% 111.194us 7.413us 0.000us 0.00% 0.000us 0.000us 15
4085
- aten::empty 1.45% 80.673us 1.45% 80.673us 3.361us 0.000us 0.00% 0.000us 0.000us 24
4086
- cudaLaunchKernel 5.96% 331.078us 5.96% 331.078us 22.072us 0.000us 0.00% 0.000us 0.000us 15
4087
- aten::empty_strided 0.28% 15.800us 0.28% 15.800us 5.267us 0.000us 0.00% 0.000us 0.000us 3
4088
- cudaDeviceGetAttribute 0.03% 1.730us 0.03% 1.730us 0.288us 0.000us 0.00% 0.000us 0.000us 6
4089
- cudaFuncSetAttribute 0.07% 3.850us 0.07% 3.850us 1.283us 0.000us 0.00% 0.000us 0.000us 3
4090
- cudaDeviceSynchronize 56.67% 3.146ms 56.67% 3.146ms 3.146ms 0.000us 0.00% 0.000us 0.000us 1
4091
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4092
- Self CPU time total: 5.551ms
4093
- Self CUDA time total: 3.827ms
4094
 
4095
 
4096
 
@@ -4100,29 +4108,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
4100
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4101
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
- torch_flash_ma 4.46% 268.165us 40.09% 2.413ms 2.413ms 0.000us 0.00% 4.405ms 4.405ms 1
4104
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.355ms 100.25% 4.355ms 4.355ms 1
4105
- aten::scaled_dot_product_attention 0.46% 27.642us 3.64% 218.806us 72.935us 0.000us 0.00% 3.540ms 1.180ms 3
4106
- aten::_scaled_dot_product_flash_attention 0.75% 45.250us 3.18% 191.164us 63.721us 0.000us 0.00% 3.540ms 1.180ms 3
4107
- aten::_flash_attention_forward 0.61% 36.651us 2.01% 120.923us 40.308us 3.540ms 81.48% 3.540ms 1.180ms 3
4108
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.540ms 81.48% 3.540ms 1.180ms 3
4109
- aten::contiguous 0.18% 10.862us 31.11% 1.873ms 156.050us 0.000us 0.00% 865.606us 72.134us 12
4110
- aten::clone 0.51% 30.490us 30.93% 1.862ms 155.145us 0.000us 0.00% 865.606us 72.134us 12
4111
- aten::copy_ 1.51% 90.931us 29.34% 1.766ms 147.155us 804.645us 18.52% 865.606us 72.134us 12
4112
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 804.645us 18.52% 804.645us 67.054us 12
4113
- Activity Buffer Request 21.61% 1.300ms 21.61% 1.300ms 1.300ms 60.961us 1.40% 60.961us 60.961us 1
4114
- aten::transpose 0.99% 59.753us 1.30% 78.501us 3.271us 0.000us 0.00% 0.000us 0.000us 24
4115
- aten::as_strided 0.31% 18.748us 0.31% 18.748us 0.781us 0.000us 0.00% 0.000us 0.000us 24
4116
- aten::empty_like 0.35% 20.935us 1.45% 87.165us 5.811us 0.000us 0.00% 0.000us 0.000us 15
4117
- aten::empty 1.32% 79.690us 1.32% 79.690us 3.320us 0.000us 0.00% 0.000us 0.000us 24
4118
- cudaLaunchKernel 6.67% 401.680us 6.67% 401.680us 26.779us 0.000us 0.00% 0.000us 0.000us 15
4119
- aten::empty_strided 0.27% 16.081us 0.27% 16.081us 5.360us 0.000us 0.00% 0.000us 0.000us 3
4120
- cudaDeviceGetAttribute 0.03% 2.030us 0.03% 2.030us 0.338us 0.000us 0.00% 0.000us 0.000us 6
4121
- cudaFuncSetAttribute 0.06% 3.810us 0.06% 3.810us 1.270us 0.000us 0.00% 0.000us 0.000us 3
4122
- cudaDeviceSynchronize 59.91% 3.605ms 59.91% 3.605ms 3.605ms 0.000us 0.00% 0.000us 0.000us 1
4123
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4124
- Self CPU time total: 6.018ms
4125
- Self CUDA time total: 4.344ms
4126
 
4127
 
4128
 
@@ -4132,91 +4140,39 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
4132
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4133
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4134
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4135
- torch_flash_ma 4.01% 246.839us 39.75% 2.447ms 2.447ms 0.000us 0.00% 4.458ms 4.458ms 1
4136
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.407ms 100.23% 4.407ms 4.407ms 1
4137
- aten::scaled_dot_product_attention 0.40% 24.621us 2.95% 181.474us 60.491us 0.000us 0.00% 3.579ms 1.193ms 3
4138
- aten::_scaled_dot_product_flash_attention 0.34% 20.980us 2.55% 156.853us 52.284us 0.000us 0.00% 3.579ms 1.193ms 3
4139
- aten::_flash_attention_forward 0.58% 35.588us 1.84% 113.003us 37.668us 3.579ms 81.40% 3.579ms 1.193ms 3
4140
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.579ms 81.40% 3.579ms 1.193ms 3
4141
- aten::contiguous 0.16% 10.061us 32.01% 1.971ms 164.244us 0.000us 0.00% 878.818us 73.235us 12
4142
- aten::clone 0.50% 30.903us 31.85% 1.961ms 163.406us 0.000us 0.00% 878.818us 73.235us 12
4143
- aten::copy_ 1.35% 82.841us 30.27% 1.864ms 155.305us 817.634us 18.60% 878.818us 73.235us 12
4144
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 817.634us 18.60% 817.634us 68.136us 12
4145
- Activity Buffer Request 23.50% 1.447ms 23.50% 1.447ms 1.447ms 61.184us 1.39% 61.184us 61.184us 1
4146
- aten::transpose 0.85% 52.630us 1.15% 70.790us 2.950us 0.000us 0.00% 0.000us 0.000us 24
4147
- aten::as_strided 0.29% 18.160us 0.29% 18.160us 0.757us 0.000us 0.00% 0.000us 0.000us 24
4148
- aten::empty_like 0.33% 20.456us 1.41% 86.700us 5.780us 0.000us 0.00% 0.000us 0.000us 15
4149
- aten::empty 1.28% 78.794us 1.28% 78.794us 3.283us 0.000us 0.00% 0.000us 0.000us 24
4150
- cudaLaunchKernel 5.81% 357.919us 5.81% 357.919us 23.861us 0.000us 0.00% 0.000us 0.000us 15
4151
- aten::empty_strided 0.25% 15.401us 0.25% 15.401us 5.134us 0.000us 0.00% 0.000us 0.000us 3
4152
- cudaDeviceGetAttribute 0.03% 1.632us 0.03% 1.632us 0.272us 0.000us 0.00% 0.000us 0.000us 6
4153
- cudaFuncSetAttribute 0.06% 3.720us 0.06% 3.720us 1.240us 0.000us 0.00% 0.000us 0.000us 3
4154
- cudaDeviceSynchronize 60.25% 3.709ms 60.25% 3.709ms 3.709ms 0.000us 0.00% 0.000us 0.000us 1
4155
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4156
- Self CPU time total: 6.156ms
4157
- Self CUDA time total: 4.397ms
4158
 
4159
 
4160
  impl wl p50(ms) ok
4161
- torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
4162
- torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
4163
- torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
4164
- torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
4165
- torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
4166
- torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
4167
  </pre></div>
4168
- <div class="uv-install-logs" id="uv-logs-benchmark">
4169
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4170
- <div class="uv-logs-content" style="display: none;">
4171
- Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4172
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4173
- Downloading matplotlib (8.3MiB)
4174
- Downloading nvidia-cufile-cu12 (1.1MiB)
4175
- Downloading numpy (16.2MiB)
4176
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4177
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4178
- Downloading nvidia-cublas-cu12 (566.8MiB)
4179
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4180
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4181
- Downloading kiwisolver (1.4MiB)
4182
- Downloading networkx (1.9MiB)
4183
- Downloading nvidia-curand-cu12 (60.7MiB)
4184
- Downloading sympy (6.0MiB)
4185
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4186
- Downloading setuptools (1.1MiB)
4187
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4188
- Downloading triton (148.3MiB)
4189
- Downloading pillow (6.7MiB)
4190
- Downloading nvidia-cufft-cu12 (184.2MiB)
4191
- Downloading fonttools (4.7MiB)
4192
- Downloading nvidia-nccl-cu12 (307.4MiB)
4193
- Downloading torch (846.9MiB)
4194
- Downloading nvidia-cufile-cu12
4195
- Downloading kiwisolver
4196
- Downloading setuptools
4197
- Downloading networkx
4198
- Downloading fonttools
4199
- Downloading pillow
4200
- Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4201
- Downloading nvidia-cuda-cupti-cu12
4202
- Downloading matplotlib
4203
- Downloading numpy
4204
- Downloading sympy
4205
- Downloading nvidia-nvjitlink-cu12
4206
- Downloading nvidia-curand-cu12
4207
- Downloading nvidia-cuda-nvrtc-cu12
4208
- Downloading triton
4209
- Downloading nvidia-cufft-cu12
4210
- Downloading nvidia-cusolver-cu12
4211
- Downloading nvidia-cusparse-cu12
4212
- Downloading nvidia-cusparselt-cu12
4213
- Downloading nvidia-nccl-cu12
4214
- Downloading nvidia-cublas-cu12
4215
- Downloading nvidia-cudnn-cu12
4216
- Downloading torch
4217
- Installed 37 packages in 212ms
4218
- </div>
4219
- </div>
4220
  <div class="cell-artifacts">
4221
  <h4>Artifacts:</h4>
4222
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
 
3879
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3880
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3881
  </span> |
3882
+ Cell: nv | 0.26s
3883
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3884
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3885
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3896
  </div>
3897
  </div>
3898
  <div id="output-nv" class="cell-output">
3899
+ <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 15:50:02 2025
3900
  +-----------------------------------------------------------------------------------------+
3901
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3902
  |-----------------------------------------+------------------------+----------------------+
 
3905
  | | | MIG M. |
3906
  |=========================================+========================+======================|
3907
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3908
+ | N/A 29C P0 165W / 350W | 0MiB / 46068MiB | 61% Default |
3909
  | | | N/A |
3910
  +-----------------------------------------+------------------------+----------------------+
3911
 
 
3927
  <span class="collapse-indicators">
3928
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3929
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3930
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3931
  </span> |
3932
+ Cell: benchmark | 3.82s
3933
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3934
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3935
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3980
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3981
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3982
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3983
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.562ms 101.45% 3.562ms 3.562ms 1
3984
+ torch_flash_ma 6.38% 328.580us 45.84% 2.360ms 2.360ms 0.000us 0.00% 3.551ms 3.551ms 1
3985
+ aten::scaled_dot_product_attention 0.79% 40.571us 4.12% 212.315us 70.772us 0.000us 0.00% 2.798ms 932.779us 3
3986
+ aten::_scaled_dot_product_flash_attention 0.52% 26.642us 3.34% 171.744us 57.248us 0.000us 0.00% 2.798ms 932.779us 3
3987
+ aten::_flash_attention_forward 0.74% 37.939us 2.40% 123.383us 41.128us 2.798ms 79.71% 2.798ms 932.779us 3
3988
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.798ms 79.71% 2.798ms 932.779us 3
3989
+ aten::contiguous 0.27% 13.720us 34.12% 1.757ms 146.409us 0.000us 0.00% 752.288us 62.691us 12
3990
+ aten::clone 0.73% 37.449us 33.85% 1.743ms 145.266us 0.000us 0.00% 752.288us 62.691us 12
3991
+ aten::copy_ 1.68% 86.484us 31.57% 1.625ms 135.456us 712.095us 20.29% 752.288us 62.691us 12
3992
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 712.095us 20.29% 712.095us 59.341us 12
3993
+ Activity Buffer Request 28.00% 1.442ms 28.00% 1.442ms 1.442ms 40.193us 1.14% 40.193us 40.193us 1
3994
+ aten::transpose 1.22% 62.637us 1.64% 84.218us 3.509us 0.000us 0.00% 0.000us 0.000us 24
3995
+ aten::as_strided 0.42% 21.581us 0.42% 21.581us 0.899us 0.000us 0.00% 0.000us 0.000us 24
3996
+ aten::empty_like 0.48% 24.619us 1.97% 101.523us 6.768us 0.000us 0.00% 0.000us 0.000us 15
3997
+ aten::empty 1.76% 90.465us 1.76% 90.465us 3.769us 0.000us 0.00% 0.000us 0.000us 24
3998
+ cudaLaunchKernel 2.36% 121.521us 2.36% 121.521us 8.101us 0.000us 0.00% 0.000us 0.000us 15
3999
+ aten::empty_strided 0.31% 15.721us 0.31% 15.721us 5.240us 0.000us 0.00% 0.000us 0.000us 3
4000
+ cudaDeviceGetAttribute 0.04% 2.280us 0.04% 2.280us 0.380us 0.000us 0.00% 0.000us 0.000us 6
4001
+ cudaFuncSetAttribute 0.16% 8.181us 0.16% 8.181us 2.727us 0.000us 0.00% 0.000us 0.000us 3
4002
+ cudaDeviceSynchronize 54.16% 2.789ms 54.16% 2.789ms 2.789ms 0.000us 0.00% 0.000us 0.000us 1
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
+ Self CPU time total: 5.149ms
4005
+ Self CUDA time total: 3.510ms
4006
 
4007
 
4008
 
 
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4014
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4015
+ torch_flash_ma 4.71% 257.538us 44.52% 2.436ms 2.436ms 0.000us 0.00% 3.763ms 3.763ms 1
4016
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.719ms 100.29% 3.719ms 3.719ms 1
4017
+ aten::scaled_dot_product_attention 0.45% 24.440us 3.30% 180.683us 60.228us 0.000us 0.00% 2.948ms 982.525us 3
4018
+ aten::_scaled_dot_product_flash_attention 0.35% 18.890us 2.86% 156.243us 52.081us 0.000us 0.00% 2.948ms 982.525us 3
4019
+ aten::_flash_attention_forward 0.68% 37.218us 2.07% 113.133us 37.711us 2.948ms 79.49% 2.948ms 982.525us 3
4020
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.948ms 79.49% 2.948ms 982.525us 3
4021
+ aten::contiguous 0.16% 8.651us 35.72% 1.955ms 162.890us 0.000us 0.00% 815.678us 67.973us 12
4022
+ aten::clone 0.48% 26.452us 35.56% 1.946ms 162.169us 0.000us 0.00% 815.678us 67.973us 12
4023
+ aten::copy_ 1.81% 99.279us 33.97% 1.859ms 154.885us 760.479us 20.51% 815.678us 67.973us 12
4024
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 760.479us 20.51% 760.479us 63.373us 12
4025
+ Activity Buffer Request 30.60% 1.674ms 30.60% 1.674ms 1.674ms 55.199us 1.49% 55.199us 55.199us 1
4026
+ aten::transpose 0.92% 50.270us 1.23% 67.460us 2.811us 0.000us 0.00% 0.000us 0.000us 24
4027
+ aten::as_strided 0.31% 17.190us 0.31% 17.190us 0.716us 0.000us 0.00% 0.000us 0.000us 24
4028
+ aten::empty_like 0.34% 18.723us 1.45% 79.503us 5.300us 0.000us 0.00% 0.000us 0.000us 15
4029
+ aten::empty 1.39% 75.933us 1.39% 75.933us 3.164us 0.000us 0.00% 0.000us 0.000us 24
4030
+ cudaLaunchKernel 1.98% 108.143us 1.98% 108.143us 7.210us 0.000us 0.00% 0.000us 0.000us 15
4031
+ aten::empty_strided 0.25% 13.599us 0.25% 13.599us 4.533us 0.000us 0.00% 0.000us 0.000us 3
4032
+ cudaDeviceGetAttribute 0.03% 1.831us 0.03% 1.831us 0.305us 0.000us 0.00% 0.000us 0.000us 6
4033
+ cudaFuncSetAttribute 0.07% 3.690us 0.07% 3.690us 1.230us 0.000us 0.00% 0.000us 0.000us 3
4034
+ cudaDeviceSynchronize 55.48% 3.036ms 55.48% 3.036ms 3.036ms 0.000us 0.00% 0.000us 0.000us 1
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
+ Self CPU time total: 5.472ms
4037
+ Self CUDA time total: 3.708ms
4038
 
4039
 
4040
 
 
4044
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4045
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4046
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4047
+ torch_flash_ma 4.65% 248.558us 40.70% 2.176ms 2.176ms 0.000us 0.00% 3.868ms 3.868ms 1
4048
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.819ms 100.29% 3.819ms 3.819ms 1
4049
+ aten::scaled_dot_product_attention 0.45% 24.181us 3.36% 179.834us 59.945us 0.000us 0.00% 3.027ms 1.009ms 3
4050
+ aten::_scaled_dot_product_flash_attention 0.34% 18.100us 2.91% 155.653us 51.884us 0.000us 0.00% 3.027ms 1.009ms 3
4051
+ aten::_flash_attention_forward 0.73% 38.760us 2.16% 115.412us 38.471us 3.027ms 79.48% 3.027ms 1.009ms 3
4052
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.027ms 79.48% 3.027ms 1.009ms 3
4053
+ aten::contiguous 0.16% 8.609us 31.88% 1.704ms 142.018us 0.000us 0.00% 841.280us 70.107us 12
4054
+ aten::clone 0.50% 26.820us 31.72% 1.696ms 141.301us 0.000us 0.00% 841.280us 70.107us 12
4055
+ aten::copy_ 1.47% 78.703us 30.10% 1.609ms 134.076us 781.631us 20.52% 841.280us 70.107us 12
4056
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 781.631us 20.52% 781.631us 65.136us 12
4057
+ Activity Buffer Request 27.11% 1.449ms 27.11% 1.449ms 1.449ms 59.649us 1.57% 59.649us 59.649us 1
4058
+ aten::transpose 0.90% 48.151us 1.22% 65.102us 2.713us 0.000us 0.00% 0.000us 0.000us 24
4059
+ aten::as_strided 0.32% 16.951us 0.32% 16.951us 0.706us 0.000us 0.00% 0.000us 0.000us 24
4060
+ aten::empty_like 0.35% 18.789us 1.49% 79.862us 5.324us 0.000us 0.00% 0.000us 0.000us 15
4061
+ aten::empty 1.38% 73.892us 1.38% 73.892us 3.079us 0.000us 0.00% 0.000us 0.000us 24
4062
+ cudaLaunchKernel 1.96% 104.680us 1.96% 104.680us 6.979us 0.000us 0.00% 0.000us 0.000us 15
4063
+ aten::empty_strided 0.28% 15.081us 0.28% 15.081us 5.027us 0.000us 0.00% 0.000us 0.000us 3
4064
+ cudaDeviceGetAttribute 0.03% 1.791us 0.03% 1.791us 0.299us 0.000us 0.00% 0.000us 0.000us 6
4065
+ cudaFuncSetAttribute 0.07% 3.500us 0.07% 3.500us 1.167us 0.000us 0.00% 0.000us 0.000us 3
4066
+ cudaDeviceSynchronize 59.30% 3.169ms 59.30% 3.169ms 3.169ms 0.000us 0.00% 0.000us 0.000us 1
4067
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4068
+ Self CPU time total: 5.345ms
4069
+ Self CUDA time total: 3.808ms
4070
 
4071
 
4072
 
 
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4078
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4079
+ torch_flash_ma 4.50% 255.237us 42.25% 2.398ms 2.398ms 0.000us 0.00% 3.984ms 3.984ms 1
4080
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.936ms 100.28% 3.936ms 3.936ms 1
4081
+ aten::scaled_dot_product_attention 0.42% 23.840us 3.17% 179.904us 59.968us 0.000us 0.00% 3.135ms 1.045ms 3
4082
+ aten::_scaled_dot_product_flash_attention 0.36% 20.442us 2.75% 156.064us 52.021us 0.000us 0.00% 3.135ms 1.045ms 3
4083
+ aten::_flash_attention_forward 0.68% 38.721us 1.99% 113.183us 37.728us 3.135ms 79.87% 3.135ms 1.045ms 3
4084
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.135ms 79.87% 3.135ms 1.045ms 3
4085
+ aten::contiguous 0.17% 9.382us 33.81% 1.919ms 159.915us 0.000us 0.00% 848.416us 70.701us 12
4086
+ aten::clone 0.52% 29.639us 33.64% 1.910ms 159.133us 0.000us 0.00% 848.416us 70.701us 12
4087
+ aten::copy_ 1.40% 79.644us 32.03% 1.818ms 151.492us 790.048us 20.13% 848.416us 70.701us 12
4088
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 790.048us 20.13% 790.048us 65.837us 12
4089
+ Activity Buffer Request 25.14% 1.427ms 25.14% 1.427ms 1.427ms 58.368us 1.49% 58.368us 58.368us 1
4090
+ aten::transpose 0.87% 49.289us 1.17% 66.169us 2.757us 0.000us 0.00% 0.000us 0.000us 24
4091
+ aten::as_strided 0.30% 16.880us 0.30% 16.880us 0.703us 0.000us 0.00% 0.000us 0.000us 24
4092
+ aten::empty_like 0.35% 19.852us 1.42% 80.662us 5.377us 0.000us 0.00% 0.000us 0.000us 15
4093
+ aten::empty 1.32% 74.981us 1.32% 74.981us 3.124us 0.000us 0.00% 0.000us 0.000us 24
4094
+ cudaLaunchKernel 5.89% 334.125us 5.89% 334.125us 22.275us 0.000us 0.00% 0.000us 0.000us 15
4095
+ aten::empty_strided 0.24% 13.720us 0.24% 13.720us 4.573us 0.000us 0.00% 0.000us 0.000us 3
4096
+ cudaDeviceGetAttribute 0.03% 1.760us 0.03% 1.760us 0.293us 0.000us 0.00% 0.000us 0.000us 6
4097
+ cudaFuncSetAttribute 0.06% 3.570us 0.06% 3.570us 1.190us 0.000us 0.00% 0.000us 0.000us 3
4098
+ cudaDeviceSynchronize 57.75% 3.278ms 57.75% 3.278ms 3.278ms 0.000us 0.00% 0.000us 0.000us 1
4099
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4100
+ Self CPU time total: 5.676ms
4101
+ Self CUDA time total: 3.925ms
4102
 
4103
 
4104
 
 
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4110
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4111
+ torch_flash_ma 5.07% 311.056us 40.82% 2.505ms 2.505ms 0.000us 0.00% 4.409ms 4.409ms 1
4112
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.359ms 100.26% 4.359ms 4.359ms 1
4113
+ aten::scaled_dot_product_attention 0.41% 24.931us 3.07% 188.265us 62.755us 0.000us 0.00% 3.539ms 1.180ms 3
4114
+ aten::_scaled_dot_product_flash_attention 0.33% 20.199us 2.66% 163.334us 54.445us 0.000us 0.00% 3.539ms 1.180ms 3
4115
+ aten::_flash_attention_forward 0.67% 41.371us 1.94% 118.823us 39.608us 3.539ms 81.38% 3.539ms 1.180ms 3
4116
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.539ms 81.38% 3.539ms 1.180ms 3
4117
+ aten::contiguous 0.16% 9.771us 31.97% 1.962ms 163.526us 0.000us 0.00% 870.819us 72.568us 12
4118
+ aten::clone 0.47% 28.779us 31.82% 1.953ms 162.712us 0.000us 0.00% 870.819us 72.568us 12
4119
+ aten::copy_ 1.27% 77.896us 30.33% 1.862ms 155.132us 809.571us 18.62% 870.819us 72.568us 12
4120
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 809.571us 18.62% 809.571us 67.464us 12
4121
+ Activity Buffer Request 24.14% 1.481ms 24.14% 1.481ms 1.481ms 61.248us 1.41% 61.248us 61.248us 1
4122
+ aten::transpose 0.82% 50.583us 1.11% 68.092us 2.837us 0.000us 0.00% 0.000us 0.000us 24
4123
+ aten::as_strided 0.29% 17.509us 0.29% 17.509us 0.730us 0.000us 0.00% 0.000us 0.000us 24
4124
+ aten::empty_like 0.32% 19.913us 1.33% 81.883us 5.459us 0.000us 0.00% 0.000us 0.000us 15
4125
+ aten::empty 1.23% 75.660us 1.23% 75.660us 3.153us 0.000us 0.00% 0.000us 0.000us 24
4126
+ cudaLaunchKernel 5.31% 325.825us 5.31% 325.825us 21.722us 0.000us 0.00% 0.000us 0.000us 15
4127
+ aten::empty_strided 0.24% 14.770us 0.24% 14.770us 4.923us 0.000us 0.00% 0.000us 0.000us 3
4128
+ cudaDeviceGetAttribute 0.03% 1.990us 0.03% 1.990us 0.332us 0.000us 0.00% 0.000us 0.000us 6
4129
+ cudaFuncSetAttribute 0.06% 3.670us 0.06% 3.670us 1.223us 0.000us 0.00% 0.000us 0.000us 3
4130
+ cudaDeviceSynchronize 59.18% 3.632ms 59.18% 3.632ms 3.632ms 0.000us 0.00% 0.000us 0.000us 1
4131
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4132
+ Self CPU time total: 6.137ms
4133
+ Self CUDA time total: 4.348ms
4134
 
4135
 
4136
 
 
4140
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4141
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4142
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4143
+ torch_flash_ma 4.13% 252.675us 38.98% 2.384ms 2.384ms 0.000us 0.00% 4.451ms 4.451ms 1
4144
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.400ms 100.24% 4.400ms 4.400ms 1
4145
+ aten::scaled_dot_product_attention 0.50% 30.480us 3.11% 190.334us 63.445us 0.000us 0.00% 3.566ms 1.189ms 3
4146
+ aten::_scaled_dot_product_flash_attention 0.31% 19.082us 2.61% 159.854us 53.285us 0.000us 0.00% 3.566ms 1.189ms 3
4147
+ aten::_flash_attention_forward 0.62% 38.112us 1.93% 118.053us 39.351us 3.566ms 81.24% 3.566ms 1.189ms 3
4148
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.566ms 81.24% 3.566ms 1.189ms 3
4149
+ aten::contiguous 0.16% 9.891us 31.02% 1.897ms 158.059us 0.000us 0.00% 884.831us 73.736us 12
4150
+ aten::clone 0.50% 30.290us 30.85% 1.887ms 157.234us 0.000us 0.00% 884.831us 73.736us 12
4151
+ aten::copy_ 1.28% 78.520us 29.35% 1.795ms 149.550us 823.711us 18.76% 884.831us 73.736us 12
4152
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 823.711us 18.76% 823.711us 68.643us 12
4153
+ Activity Buffer Request 23.29% 1.424ms 23.29% 1.424ms 1.424ms 61.120us 1.39% 61.120us 61.120us 1
4154
+ aten::transpose 0.81% 49.593us 1.09% 66.721us 2.780us 0.000us 0.00% 0.000us 0.000us 24
4155
+ aten::as_strided 0.28% 17.128us 0.28% 17.128us 0.714us 0.000us 0.00% 0.000us 0.000us 24
4156
+ aten::empty_like 0.33% 20.381us 1.35% 82.362us 5.491us 0.000us 0.00% 0.000us 0.000us 15
4157
+ aten::empty 1.23% 74.920us 1.23% 74.920us 3.122us 0.000us 0.00% 0.000us 0.000us 24
4158
+ cudaLaunchKernel 5.19% 317.558us 5.19% 317.558us 21.171us 0.000us 0.00% 0.000us 0.000us 15
4159
+ aten::empty_strided 0.25% 15.161us 0.25% 15.161us 5.054us 0.000us 0.00% 0.000us 0.000us 3
4160
+ cudaDeviceGetAttribute 0.03% 1.791us 0.03% 1.791us 0.299us 0.000us 0.00% 0.000us 0.000us 6
4161
+ cudaFuncSetAttribute 0.06% 3.670us 0.06% 3.670us 1.223us 0.000us 0.00% 0.000us 0.000us 3
4162
+ cudaDeviceSynchronize 61.02% 3.732ms 61.02% 3.732ms 3.732ms 0.000us 0.00% 0.000us 0.000us 1
4163
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4164
+ Self CPU time total: 6.115ms
4165
+ Self CUDA time total: 4.390ms
4166
 
4167
 
4168
  impl wl p50(ms) ok
4169
+ torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
4170
+ torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
4171
+ torch_flash_ma cuda_attn_L320_bfloat16 1.30 True
4172
+ torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
4173
+ torch_flash_ma cuda_attn_L448_bfloat16 1.48 True
4174
+ torch_flash_ma cuda_attn_L512_bfloat16 1.49 True
4175
  </pre></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4176
  <div class="cell-artifacts">
4177
  <h4>Artifacts:</h4>
4178
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/hf_kernels_flash_attn.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
@@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 5.58s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3926,21 +3934,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
3926
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3927
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3928
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3929
- hf_kernels_flash_attn 3.55% 156.153us 41.08% 1.807ms 1.807ms 0.000us 0.00% 3.775ms 3.775ms 1
3930
- _flash_attn_9e27194::fwd 1.65% 72.542us 37.53% 1.651ms 550.240us 2.812ms 100.00% 3.775ms 1.258ms 3
3931
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.814ms 100.05% 2.814ms 2.814ms 1
3932
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.812ms 100.00% 2.812ms 937.398us 3
3933
- Activity Buffer Request 32.22% 1.417ms 32.22% 1.417ms 1.417ms 962.880us 34.24% 962.880us 962.880us 1
3934
- cudaDeviceGetAttribute 0.13% 5.500us 0.13% 5.500us 0.367us 0.000us 0.00% 0.000us 0.000us 15
3935
- aten::empty_like 0.43% 19.110us 1.25% 54.882us 18.294us 0.000us 0.00% 0.000us 0.000us 3
3936
- aten::empty_strided 0.81% 35.772us 0.81% 35.772us 11.924us 0.000us 0.00% 0.000us 0.000us 3
3937
- aten::empty 0.57% 25.101us 0.57% 25.101us 2.789us 0.000us 0.00% 0.000us 0.000us 9
3938
- cudaFuncSetAttribute 0.30% 13.270us 0.30% 13.270us 4.423us 0.000us 0.00% 0.000us 0.000us 3
3939
- cudaLaunchKernel 1.42% 62.402us 1.42% 62.402us 20.801us 0.000us 0.00% 0.000us 0.000us 3
3940
- cudaDeviceSynchronize 58.92% 2.591ms 58.92% 2.591ms 2.591ms 0.000us 0.00% 0.000us 0.000us 1
3941
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3942
- Self CPU time total: 4.398ms
3943
- Self CUDA time total: 2.812ms
3944
 
3945
 
3946
 
@@ -3950,21 +3958,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
3950
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3951
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3952
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3953
- hf_kernels_flash_attn 2.04% 91.192us 36.62% 1.634ms 1.634ms 0.000us 0.00% 3.983ms 3.983ms 1
3954
- _flash_attn_9e27194::fwd 1.11% 49.718us 34.57% 1.543ms 514.203us 2.978ms 100.00% 3.983ms 1.328ms 3
3955
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.980ms 100.05% 2.980ms 2.980ms 1
3956
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.978ms 100.00% 2.978ms 992.707us 3
3957
- Activity Buffer Request 31.74% 1.416ms 31.74% 1.416ms 1.416ms 1.004ms 33.73% 1.004ms 1.004ms 1
3958
- cudaDeviceGetAttribute 0.08% 3.711us 0.08% 3.711us 0.247us 0.000us 0.00% 0.000us 0.000us 15
3959
- aten::empty_like 0.17% 7.481us 0.51% 22.841us 7.614us 0.000us 0.00% 0.000us 0.000us 3
3960
- aten::empty_strided 0.34% 15.360us 0.34% 15.360us 5.120us 0.000us 0.00% 0.000us 0.000us 3
3961
- aten::empty 0.46% 20.620us 0.46% 20.620us 2.291us 0.000us 0.00% 0.000us 0.000us 9
3962
- cudaFuncSetAttribute 0.08% 3.741us 0.08% 3.741us 1.247us 0.000us 0.00% 0.000us 0.000us 3
3963
- cudaLaunchKernel 0.58% 25.842us 0.58% 25.842us 8.614us 0.000us 0.00% 0.000us 0.000us 3
3964
- cudaDeviceSynchronize 63.38% 2.828ms 63.38% 2.828ms 2.828ms 0.000us 0.00% 0.000us 0.000us 1
3965
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3966
- Self CPU time total: 4.462ms
3967
- Self CUDA time total: 2.978ms
3968
 
3969
 
3970
 
@@ -3974,21 +3982,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3976
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
- hf_kernels_flash_attn 2.28% 105.284us 36.17% 1.673ms 1.673ms 0.000us 0.00% 4.145ms 4.145ms 1
3978
- _flash_attn_9e27194::fwd 1.09% 50.271us 33.89% 1.567ms 522.459us 3.096ms 100.00% 4.145ms 1.382ms 3
3979
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.098ms 100.05% 3.098ms 3.098ms 1
3980
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.096ms 100.00% 3.096ms 1.032ms 3
3981
- Activity Buffer Request 31.08% 1.437ms 31.08% 1.437ms 1.437ms 1.049ms 33.87% 1.049ms 1.049ms 1
3982
- cudaDeviceGetAttribute 0.08% 3.850us 0.08% 3.850us 0.257us 0.000us 0.00% 0.000us 0.000us 15
3983
- aten::empty_like 0.15% 7.061us 0.49% 22.631us 7.544us 0.000us 0.00% 0.000us 0.000us 3
3984
- aten::empty_strided 0.34% 15.570us 0.34% 15.570us 5.190us 0.000us 0.00% 0.000us 0.000us 3
3985
- aten::empty 0.47% 21.760us 0.47% 21.760us 2.418us 0.000us 0.00% 0.000us 0.000us 9
3986
- cudaFuncSetAttribute 0.08% 3.689us 0.08% 3.689us 1.230us 0.000us 0.00% 0.000us 0.000us 3
3987
- cudaLaunchKernel 0.61% 27.992us 0.61% 27.992us 9.331us 0.000us 0.00% 0.000us 0.000us 3
3988
- cudaDeviceSynchronize 63.83% 2.952ms 63.83% 2.952ms 2.952ms 0.000us 0.00% 0.000us 0.000us 1
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
- Self CPU time total: 4.625ms
3991
- Self CUDA time total: 3.096ms
3992
 
3993
 
3994
 
@@ -3998,21 +4006,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4000
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4001
- hf_kernels_flash_attn 2.30% 110.882us 38.29% 1.842ms 1.842ms 0.000us 0.00% 4.161ms 4.161ms 1
4002
- _flash_attn_9e27194::fwd 1.05% 50.321us 35.98% 1.731ms 577.014us 3.117ms 100.00% 4.161ms 1.387ms 3
4003
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.118ms 100.05% 3.118ms 3.118ms 1
4004
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.117ms 100.00% 3.117ms 1.039ms 3
4005
- Activity Buffer Request 29.64% 1.426ms 29.64% 1.426ms 1.426ms 1.044ms 33.50% 1.044ms 1.044ms 1
4006
- cudaDeviceGetAttribute 0.08% 3.780us 0.08% 3.780us 0.252us 0.000us 0.00% 0.000us 0.000us 15
4007
- aten::empty_like 0.15% 7.259us 0.50% 24.240us 8.080us 0.000us 0.00% 0.000us 0.000us 3
4008
- aten::empty_strided 0.35% 16.981us 0.35% 16.981us 5.660us 0.000us 0.00% 0.000us 0.000us 3
4009
- aten::empty 0.45% 21.602us 0.45% 21.602us 2.400us 0.000us 0.00% 0.000us 0.000us 9
4010
- cudaFuncSetAttribute 0.08% 3.770us 0.08% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3
4011
- cudaLaunchKernel 4.18% 201.205us 4.18% 201.205us 67.068us 0.000us 0.00% 0.000us 0.000us 3
4012
- cudaDeviceSynchronize 61.71% 2.969ms 61.71% 2.969ms 2.969ms 0.000us 0.00% 0.000us 0.000us 1
4013
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4014
- Self CPU time total: 4.811ms
4015
- Self CUDA time total: 3.117ms
4016
 
4017
 
4018
 
@@ -4022,21 +4030,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
4022
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4023
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4024
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
- hf_kernels_flash_attn 2.05% 108.443us 34.64% 1.832ms 1.832ms 0.000us 0.00% 4.810ms 4.810ms 1
4026
- _flash_attn_9e27194::fwd 0.96% 50.812us 32.59% 1.723ms 574.364us 3.602ms 100.00% 4.810ms 1.603ms 3
4027
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.603ms 100.04% 3.603ms 3.603ms 1
4028
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.602ms 100.00% 3.602ms 1.201ms 3
4029
- Activity Buffer Request 27.53% 1.455ms 27.53% 1.455ms 1.455ms 1.209ms 33.55% 1.209ms 1.209ms 1
4030
- cudaDeviceGetAttribute 0.08% 4.070us 0.08% 4.070us 0.271us 0.000us 0.00% 0.000us 0.000us 15
4031
- aten::empty_like 0.14% 7.390us 0.45% 23.900us 7.967us 0.000us 0.00% 0.000us 0.000us 3
4032
- aten::empty_strided 0.31% 16.510us 0.31% 16.510us 5.503us 0.000us 0.00% 0.000us 0.000us 3
4033
- aten::empty 0.40% 21.151us 0.40% 21.151us 2.350us 0.000us 0.00% 0.000us 0.000us 9
4034
- cudaFuncSetAttribute 0.07% 3.770us 0.07% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3
4035
- cudaLaunchKernel 3.10% 164.023us 3.10% 164.023us 54.674us 0.000us 0.00% 0.000us 0.000us 3
4036
- cudaDeviceSynchronize 65.36% 3.455ms 65.36% 3.455ms 3.455ms 0.000us 0.00% 0.000us 0.000us 1
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
- Self CPU time total: 5.287ms
4039
- Self CUDA time total: 3.602ms
4040
 
4041
 
4042
 
@@ -4046,35 +4054,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
4046
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4047
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
- hf_kernels_flash_attn 1.95% 105.103us 34.11% 1.836ms 1.836ms 0.000us 0.00% 4.931ms 4.931ms 1
4050
- _flash_attn_9e27194::fwd 1.08% 58.141us 32.16% 1.731ms 577.087us 3.693ms 100.00% 4.931ms 1.644ms 3
4051
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.695ms 100.04% 3.695ms 3.695ms 1
4052
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.693ms 100.00% 3.693ms 1.231ms 3
4053
- Activity Buffer Request 26.71% 1.438ms 26.71% 1.438ms 1.438ms 1.238ms 33.53% 1.238ms 1.238ms 1
4054
- cudaDeviceGetAttribute 0.08% 4.380us 0.08% 4.380us 0.292us 0.000us 0.00% 0.000us 0.000us 15
4055
- aten::empty_like 0.15% 8.230us 0.50% 26.750us 8.917us 0.000us 0.00% 0.000us 0.000us 3
4056
- aten::empty_strided 0.34% 18.520us 0.34% 18.520us 6.173us 0.000us 0.00% 0.000us 0.000us 3
4057
- aten::empty 0.48% 25.961us 0.48% 25.961us 2.885us 0.000us 0.00% 0.000us 0.000us 9
4058
- cudaFuncSetAttribute 0.08% 4.220us 0.08% 4.220us 1.407us 0.000us 0.00% 0.000us 0.000us 3
4059
- cudaLaunchKernel 3.23% 173.714us 3.23% 173.714us 57.905us 0.000us 0.00% 0.000us 0.000us 3
4060
- cudaDeviceSynchronize 65.89% 3.548ms 65.89% 3.548ms 3.548ms 0.000us 0.00% 0.000us 0.000us 1
4061
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4062
- Self CPU time total: 5.384ms
4063
- Self CUDA time total: 3.693ms
4064
 
4065
 
4066
  impl wl p50(ms) ok
4067
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.97 True
4068
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
4069
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
4070
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.09 True
4071
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True
4072
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
4073
  </pre></div>
4074
  <div class="cell-stderr">
4075
  Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
4076
- Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:13, 1.34it/s]
4077
- Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 13.40it/s]
 
4078
  </div>
4079
  <div class="cell-artifacts">
4080
  <h4>Artifacts:</h4>
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
 
3879
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3880
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3881
  </span> |
3882
+ Cell: benchmark | 5.46s
3883
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3884
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3885
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3934
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3935
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3936
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3937
+ hf_kernels_flash_attn 3.61% 157.413us 41.18% 1.795ms 1.795ms 0.000us 0.00% 3.726ms 3.726ms 1
3938
+ _flash_attn_9e27194::fwd 1.61% 70.165us 37.57% 1.638ms 545.853us 2.781ms 100.00% 3.726ms 1.242ms 3
3939
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.783ms 100.05% 2.783ms 2.783ms 1
3940
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.781ms 100.00% 2.781ms 927.059us 3
3941
+ Activity Buffer Request 32.93% 1.435ms 32.93% 1.435ms 1.435ms 944.349us 33.96% 944.349us 944.349us 1
3942
+ cudaDeviceGetAttribute 0.11% 4.789us 0.11% 4.789us 0.319us 0.000us 0.00% 0.000us 0.000us 15
3943
+ aten::empty_like 0.38% 16.590us 1.18% 51.251us 17.084us 0.000us 0.00% 0.000us 0.000us 3
3944
+ aten::empty_strided 0.80% 34.661us 0.80% 34.661us 11.554us 0.000us 0.00% 0.000us 0.000us 3
3945
+ aten::empty 0.57% 24.950us 0.57% 24.950us 2.772us 0.000us 0.00% 0.000us 0.000us 9
3946
+ cudaFuncSetAttribute 0.27% 11.579us 0.27% 11.579us 3.860us 0.000us 0.00% 0.000us 0.000us 3
3947
+ cudaLaunchKernel 0.90% 39.431us 0.90% 39.431us 13.144us 0.000us 0.00% 0.000us 0.000us 3
3948
+ cudaDeviceSynchronize 58.82% 2.564ms 58.82% 2.564ms 2.564ms 0.000us 0.00% 0.000us 0.000us 1
3949
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3950
+ Self CPU time total: 4.359ms
3951
+ Self CUDA time total: 2.781ms
3952
 
3953
 
3954
 
 
3958
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3959
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3960
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3961
+ hf_kernels_flash_attn 1.92% 86.861us 37.15% 1.685ms 1.685ms 0.000us 0.00% 3.967ms 3.967ms 1
3962
+ _flash_attn_9e27194::fwd 1.05% 47.633us 35.24% 1.598ms 532.729us 2.988ms 100.00% 3.967ms 1.322ms 3
3963
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.989ms 100.05% 2.989ms 2.989ms 1
3964
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.988ms 100.00% 2.988ms 995.953us 3
3965
+ Activity Buffer Request 32.54% 1.476ms 32.54% 1.476ms 1.476ms 979.196us 32.77% 979.196us 979.196us 1
3966
+ cudaDeviceGetAttribute 0.08% 3.549us 0.08% 3.549us 0.237us 0.000us 0.00% 0.000us 0.000us 15
3967
+ aten::empty_like 0.15% 6.770us 0.48% 21.750us 7.250us 0.000us 0.00% 0.000us 0.000us 3
3968
+ aten::empty_strided 0.33% 14.980us 0.33% 14.980us 4.993us 0.000us 0.00% 0.000us 0.000us 3
3969
+ aten::empty 0.45% 20.562us 0.45% 20.562us 2.285us 0.000us 0.00% 0.000us 0.000us 9
3970
+ cudaFuncSetAttribute 0.08% 3.410us 0.08% 3.410us 1.137us 0.000us 0.00% 0.000us 0.000us 3
3971
+ cudaLaunchKernel 0.56% 25.521us 0.56% 25.521us 8.507us 0.000us 0.00% 0.000us 0.000us 3
3972
+ cudaDeviceSynchronize 62.85% 2.850ms 62.85% 2.850ms 2.850ms 0.000us 0.00% 0.000us 0.000us 1
3973
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3974
+ Self CPU time total: 4.535ms
3975
+ Self CUDA time total: 2.988ms
3976
 
3977
 
3978
 
 
3982
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3983
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3984
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3985
+ hf_kernels_flash_attn 2.25% 102.643us 36.16% 1.652ms 1.652ms 0.000us 0.00% 4.081ms 4.081ms 1
3986
+ _flash_attn_9e27194::fwd 1.10% 50.081us 33.92% 1.550ms 516.605us 3.056ms 100.00% 4.081ms 1.360ms 3
3987
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.058ms 100.05% 3.058ms 3.058ms 1
3988
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.056ms 100.00% 3.056ms 1.019ms 3
3989
+ Activity Buffer Request 31.13% 1.423ms 31.13% 1.423ms 1.423ms 1.024ms 33.52% 1.024ms 1.024ms 1
3990
+ cudaDeviceGetAttribute 0.08% 3.832us 0.08% 3.832us 0.255us 0.000us 0.00% 0.000us 0.000us 15
3991
+ aten::empty_like 0.15% 6.971us 0.48% 22.109us 7.370us 0.000us 0.00% 0.000us 0.000us 3
3992
+ aten::empty_strided 0.33% 15.138us 0.33% 15.138us 5.046us 0.000us 0.00% 0.000us 0.000us 3
3993
+ aten::empty 0.46% 20.860us 0.46% 20.860us 2.318us 0.000us 0.00% 0.000us 0.000us 9
3994
+ cudaFuncSetAttribute 0.08% 3.430us 0.08% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3
3995
+ cudaLaunchKernel 0.59% 26.891us 0.59% 26.891us 8.964us 0.000us 0.00% 0.000us 0.000us 3
3996
+ cudaDeviceSynchronize 63.84% 2.917ms 63.84% 2.917ms 2.917ms 0.000us 0.00% 0.000us 0.000us 1
3997
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3998
+ Self CPU time total: 4.569ms
3999
+ Self CUDA time total: 3.056ms
4000
 
4001
 
4002
 
 
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
+ hf_kernels_flash_attn 2.25% 106.084us 38.22% 1.803ms 1.803ms 0.000us 0.00% 4.091ms 4.091ms 1
4010
+ _flash_attn_9e27194::fwd 1.01% 47.791us 35.97% 1.697ms 565.799us 3.060ms 100.00% 4.091ms 1.364ms 3
4011
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.062ms 100.06% 3.062ms 3.062ms 1
4012
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.060ms 100.00% 3.060ms 1.020ms 3
4013
+ Activity Buffer Request 30.05% 1.418ms 30.05% 1.418ms 1.418ms 1.031ms 33.68% 1.031ms 1.031ms 1
4014
+ cudaDeviceGetAttribute 0.08% 3.720us 0.08% 3.720us 0.248us 0.000us 0.00% 0.000us 0.000us 15
4015
+ aten::empty_like 0.16% 7.600us 0.52% 24.620us 8.207us 0.000us 0.00% 0.000us 0.000us 3
4016
+ aten::empty_strided 0.36% 17.020us 0.36% 17.020us 5.673us 0.000us 0.00% 0.000us 0.000us 3
4017
+ aten::empty 0.44% 20.780us 0.44% 20.780us 2.309us 0.000us 0.00% 0.000us 0.000us 9
4018
+ cudaFuncSetAttribute 0.08% 3.620us 0.08% 3.620us 1.207us 0.000us 0.00% 0.000us 0.000us 3
4019
+ cudaLaunchKernel 3.79% 178.824us 3.79% 178.824us 59.608us 0.000us 0.00% 0.000us 0.000us 3
4020
+ cudaDeviceSynchronize 61.78% 2.916ms 61.78% 2.916ms 2.916ms 0.000us 0.00% 0.000us 0.000us 1
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
+ Self CPU time total: 4.719ms
4023
+ Self CUDA time total: 3.060ms
4024
 
4025
 
4026
 
 
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4032
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4033
+ hf_kernels_flash_attn 2.06% 106.072us 34.88% 1.800ms 1.800ms 0.000us 0.00% 4.679ms 4.679ms 1
4034
+ _flash_attn_9e27194::fwd 0.97% 50.192us 32.82% 1.694ms 564.573us 3.505ms 100.00% 4.679ms 1.560ms 3
4035
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.507ms 100.04% 3.507ms 3.507ms 1
4036
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.505ms 100.00% 3.505ms 1.168ms 3
4037
+ Activity Buffer Request 27.53% 1.421ms 27.53% 1.421ms 1.421ms 1.174ms 33.50% 1.174ms 1.174ms 1
4038
+ cudaDeviceGetAttribute 0.08% 4.219us 0.08% 4.219us 0.281us 0.000us 0.00% 0.000us 0.000us 15
4039
+ aten::empty_like 0.15% 7.700us 0.46% 23.940us 7.980us 0.000us 0.00% 0.000us 0.000us 3
4040
+ aten::empty_strided 0.31% 16.240us 0.31% 16.240us 5.413us 0.000us 0.00% 0.000us 0.000us 3
4041
+ aten::empty 0.41% 21.049us 0.41% 21.049us 2.339us 0.000us 0.00% 0.000us 0.000us 9
4042
+ cudaFuncSetAttribute 0.07% 3.601us 0.07% 3.601us 1.200us 0.000us 0.00% 0.000us 0.000us 3
4043
+ cudaLaunchKernel 3.29% 169.975us 3.29% 169.975us 56.658us 0.000us 0.00% 0.000us 0.000us 3
4044
+ cudaDeviceSynchronize 65.12% 3.360ms 65.12% 3.360ms 3.360ms 0.000us 0.00% 0.000us 0.000us 1
4045
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4046
+ Self CPU time total: 5.160ms
4047
+ Self CUDA time total: 3.505ms
4048
 
4049
 
4050
 
 
4054
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4055
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
+ hf_kernels_flash_attn 2.05% 108.192us 34.34% 1.815ms 1.815ms 0.000us 0.00% 4.838ms 4.838ms 1
4058
+ _flash_attn_9e27194::fwd 0.96% 50.903us 32.30% 1.707ms 568.907us 3.618ms 100.00% 4.838ms 1.613ms 3
4059
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.620ms 100.04% 3.620ms 3.620ms 1
4060
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.618ms 100.00% 3.618ms 1.206ms 3
4061
+ Activity Buffer Request 26.73% 1.413ms 26.73% 1.413ms 1.413ms 1.220ms 33.72% 1.220ms 1.220ms 1
4062
+ cudaDeviceGetAttribute 0.07% 3.869us 0.07% 3.869us 0.258us 0.000us 0.00% 0.000us 0.000us 15
4063
+ aten::empty_like 0.14% 7.319us 0.48% 25.360us 8.453us 0.000us 0.00% 0.000us 0.000us 3
4064
+ aten::empty_strided 0.34% 18.041us 0.34% 18.041us 6.014us 0.000us 0.00% 0.000us 0.000us 3
4065
+ aten::empty 0.41% 21.680us 0.41% 21.680us 2.409us 0.000us 0.00% 0.000us 0.000us 9
4066
+ cudaFuncSetAttribute 0.07% 3.810us 0.07% 3.810us 1.270us 0.000us 0.00% 0.000us 0.000us 3
4067
+ cudaLaunchKernel 3.57% 188.496us 3.57% 188.496us 62.832us 0.000us 0.00% 0.000us 0.000us 3
4068
+ cudaDeviceSynchronize 65.66% 3.470ms 65.66% 3.470ms 3.470ms 0.000us 0.00% 0.000us 0.000us 1
4069
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4070
+ Self CPU time total: 5.285ms
4071
+ Self CUDA time total: 3.618ms
4072
 
4073
 
4074
  impl wl p50(ms) ok
4075
+ hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
4076
+ hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
4077
+ hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
4078
+ hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.07 True
4079
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
4080
+ hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.19 True
4081
  </pre></div>
4082
  <div class="cell-stderr">
4083
  Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
4084
+ Fetching 20 files: 5%|▌ | 1/20 [00:00&lt;00:03, 6.07it/s]
4085
+ Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:12, 1.40it/s]
4086
+ Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 15.82it/s]
4087
  </div>
4088
  <div class="cell-artifacts">
4089
  <h4>Artifacts:</h4>
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
@@ -3869,9 +3877,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 5.52s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3925,19 +3933,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
3925
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3926
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3927
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3928
- hf_kernels_flash_attn3 3.72% 161.222us 44.67% 1.935ms 1.935ms 0.000us 0.00% 3.599ms 3.599ms 1
3929
- FlashAttnFunc 2.81% 121.834us 40.95% 1.774ms 591.218us 0.000us 0.00% 3.599ms 1.200ms 3
3930
- _flash_attn3_48fe103_dirty::fwd 1.85% 79.992us 38.14% 1.652ms 550.607us 2.693ms 100.00% 3.599ms 1.200ms 3
3931
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.695ms 100.05% 2.695ms 2.695ms 1
3932
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.693ms 100.00% 2.693ms 897.759us 3
3933
- Activity Buffer Request 33.93% 1.470ms 33.93% 1.470ms 1.470ms 905.439us 33.62% 905.439us 905.439us 1
3934
- aten::empty 1.00% 43.311us 1.00% 43.311us 7.219us 0.000us 0.00% 0.000us 0.000us 6
3935
- cudaFuncSetAttribute 0.32% 13.891us 0.32% 13.891us 4.630us 0.000us 0.00% 0.000us 0.000us 3
3936
- cudaLaunchKernel 1.04% 45.121us 1.04% 45.121us 15.040us 0.000us 0.00% 0.000us 0.000us 3
3937
- cudaDeviceSynchronize 55.33% 2.396ms 55.33% 2.396ms 2.396ms 0.000us 0.00% 0.000us 0.000us 1
3938
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3939
- Self CPU time total: 4.331ms
3940
- Self CUDA time total: 2.693ms
3941
 
3942
 
3943
 
@@ -3947,19 +3955,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
3947
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3948
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3949
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3950
- hf_kernels_flash_attn3 2.17% 96.772us 39.76% 1.770ms 1.770ms 0.000us 0.00% 3.876ms 3.876ms 1
3951
- FlashAttnFunc 2.04% 90.694us 37.59% 1.674ms 557.834us 0.000us 0.00% 3.876ms 1.292ms 3
3952
- _flash_attn3_48fe103_dirty::fwd 1.15% 51.142us 35.55% 1.583ms 527.603us 2.896ms 100.00% 3.876ms 1.292ms 3
3953
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.898ms 100.05% 2.898ms 2.898ms 1
3954
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.896ms 100.00% 2.896ms 965.387us 3
3955
- Activity Buffer Request 33.04% 1.471ms 33.04% 1.471ms 1.471ms 979.809us 33.83% 979.809us 979.809us 1
3956
- aten::empty 0.58% 25.610us 0.58% 25.610us 4.268us 0.000us 0.00% 0.000us 0.000us 6
3957
- cudaFuncSetAttribute 0.12% 5.240us 0.12% 5.240us 1.747us 0.000us 0.00% 0.000us 0.000us 3
3958
- cudaLaunchKernel 0.67% 29.750us 0.67% 29.750us 9.917us 0.000us 0.00% 0.000us 0.000us 3
3959
- cudaDeviceSynchronize 60.24% 2.682ms 60.24% 2.682ms 2.682ms 0.000us 0.00% 0.000us 0.000us 1
3960
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3961
- Self CPU time total: 4.452ms
3962
- Self CUDA time total: 2.896ms
3963
 
3964
 
3965
 
@@ -3969,19 +3977,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
3969
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3970
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3971
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3972
- hf_kernels_flash_attn3 2.19% 98.331us 39.82% 1.786ms 1.786ms 0.000us 0.00% 3.885ms 3.885ms 1
3973
- FlashAttnFunc 1.99% 89.333us 37.63% 1.688ms 562.551us 0.000us 0.00% 3.885ms 1.295ms 3
3974
- _flash_attn3_48fe103_dirty::fwd 1.08% 48.311us 35.64% 1.598ms 532.773us 2.912ms 100.00% 3.885ms 1.295ms 3
3975
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.914ms 100.05% 2.914ms 2.914ms 1
3976
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.912ms 100.00% 2.912ms 970.802us 3
3977
- Activity Buffer Request 33.18% 1.488ms 33.18% 1.488ms 1.488ms 972.637us 33.40% 972.637us 972.637us 1
3978
- aten::empty 0.57% 25.370us 0.57% 25.370us 4.228us 0.000us 0.00% 0.000us 0.000us 6
3979
- cudaFuncSetAttribute 0.13% 5.730us 0.13% 5.730us 1.910us 0.000us 0.00% 0.000us 0.000us 3
3980
- cudaLaunchKernel 0.69% 30.861us 0.69% 30.861us 10.287us 0.000us 0.00% 0.000us 0.000us 3
3981
- cudaDeviceSynchronize 60.18% 2.699ms 60.18% 2.699ms 2.699ms 0.000us 0.00% 0.000us 0.000us 1
3982
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3983
- Self CPU time total: 4.485ms
3984
- Self CUDA time total: 2.912ms
3985
 
3986
 
3987
 
@@ -3991,19 +3999,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
- hf_kernels_flash_attn3 2.51% 118.553us 41.81% 1.973ms 1.973ms 0.000us 0.00% 3.964ms 3.964ms 1
3995
- FlashAttnFunc 1.94% 91.662us 39.30% 1.855ms 618.205us 0.000us 0.00% 3.964ms 1.321ms 3
3996
- _flash_attn3_48fe103_dirty::fwd 1.07% 50.373us 37.36% 1.763ms 587.651us 2.962ms 100.00% 3.964ms 1.321ms 3
3997
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.964ms 100.05% 2.964ms 2.964ms 1
3998
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.962ms 100.00% 2.962ms 987.401us 3
3999
- Activity Buffer Request 30.92% 1.459ms 30.92% 1.459ms 1.459ms 1.002ms 33.82% 1.002ms 1.002ms 1
4000
- aten::empty 0.56% 26.451us 0.56% 26.451us 4.408us 0.000us 0.00% 0.000us 0.000us 6
4001
- cudaFuncSetAttribute 0.11% 5.270us 0.11% 5.270us 1.757us 0.000us 0.00% 0.000us 0.000us 3
4002
- cudaLaunchKernel 4.70% 221.845us 4.70% 221.845us 73.948us 0.000us 0.00% 0.000us 0.000us 3
4003
- cudaDeviceSynchronize 58.19% 2.746ms 58.19% 2.746ms 2.746ms 0.000us 0.00% 0.000us 0.000us 1
4004
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
- Self CPU time total: 4.719ms
4006
- Self CUDA time total: 2.962ms
4007
 
4008
 
4009
 
@@ -4013,19 +4021,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
4013
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4014
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
- hf_kernels_flash_attn3 2.19% 114.453us 37.34% 1.953ms 1.953ms 0.000us 0.00% 4.662ms 4.662ms 1
4017
- FlashAttnFunc 1.73% 90.401us 35.15% 1.838ms 612.822us 0.000us 0.00% 4.662ms 1.554ms 3
4018
- _flash_attn3_48fe103_dirty::fwd 0.97% 50.643us 33.42% 1.748ms 582.688us 3.490ms 100.00% 4.662ms 1.554ms 3
4019
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.492ms 100.04% 3.492ms 3.492ms 1
4020
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.490ms 100.00% 3.490ms 1.163ms 3
4021
- Activity Buffer Request 28.44% 1.487ms 28.44% 1.487ms 1.487ms 1.171ms 33.56% 1.171ms 1.171ms 1
4022
- aten::empty 0.52% 27.271us 0.52% 27.271us 4.545us 0.000us 0.00% 0.000us 0.000us 6
4023
- cudaFuncSetAttribute 0.09% 4.950us 0.09% 4.950us 1.650us 0.000us 0.00% 0.000us 0.000us 3
4024
- cudaLaunchKernel 3.40% 178.024us 3.40% 178.024us 59.341us 0.000us 0.00% 0.000us 0.000us 3
4025
- cudaDeviceSynchronize 62.66% 3.277ms 62.66% 3.277ms 3.277ms 0.000us 0.00% 0.000us 0.000us 1
4026
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
- Self CPU time total: 5.230ms
4028
- Self CUDA time total: 3.490ms
4029
 
4030
 
4031
 
@@ -4035,34 +4043,38 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
- hf_kernels_flash_attn3 2.26% 115.663us 36.27% 1.854ms 1.854ms 0.000us 0.00% 4.679ms 4.679ms 1
4039
- FlashAttnFunc 2.25% 114.773us 34.01% 1.738ms 579.364us 0.000us 0.00% 4.679ms 1.560ms 3
4040
- _flash_attn3_48fe103_dirty::fwd 1.02% 51.933us 31.76% 1.623ms 541.107us 3.499ms 100.00% 4.679ms 1.560ms 3
4041
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.500ms 100.04% 3.500ms 3.500ms 1
4042
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.499ms 100.00% 3.499ms 1.166ms 3
4043
- Activity Buffer Request 26.80% 1.370ms 26.80% 1.370ms 1.370ms 1.181ms 33.75% 1.181ms 1.181ms 1
4044
- aten::empty 0.54% 27.681us 0.54% 27.681us 4.613us 0.000us 0.00% 0.000us 0.000us 6
4045
- cudaFuncSetAttribute 0.10% 5.079us 0.10% 5.079us 1.693us 0.000us 0.00% 0.000us 0.000us 3
4046
- cudaLaunchKernel 3.30% 168.813us 3.30% 168.813us 56.271us 0.000us 0.00% 0.000us 0.000us 3
4047
- cudaDeviceSynchronize 63.73% 3.257ms 63.73% 3.257ms 3.257ms 0.000us 0.00% 0.000us 0.000us 1
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
- Self CPU time total: 5.111ms
4050
- Self CUDA time total: 3.499ms
4051
 
4052
 
4053
  impl wl p50(ms) ok
4054
- hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.94 True
4055
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
4056
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.04 True
4057
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.05 True
4058
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
4059
- hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4060
  </pre></div>
4061
- <div class="cell-stderr">
4062
- Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4063
- Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.38it/s]
4064
- Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.75it/s]
 
4065
  </div>
 
 
 
4066
  <div class="cell-artifacts">
4067
  <h4>Artifacts:</h4>
4068
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
 
3877
  <span class="collapse-indicators">
3878
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3879
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3880
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3881
  </span> |
3882
+ Cell: benchmark | 5.78s
3883
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3884
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3885
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3933
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3934
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3935
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3936
+ hf_kernels_flash_attn3 3.71% 164.893us 43.76% 1.944ms 1.944ms 0.000us 0.00% 3.688ms 3.688ms 1
3937
+ FlashAttnFunc 2.67% 118.403us 40.05% 1.779ms 593.141us 0.000us 0.00% 3.688ms 1.229ms 3
3938
+ _flash_attn3_48fe103_dirty::fwd 1.75% 77.922us 37.39% 1.661ms 553.673us 2.790ms 100.00% 3.688ms 1.229ms 3
3939
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.791ms 100.05% 2.791ms 2.791ms 1
3940
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.790ms 100.00% 2.790ms 929.856us 3
3941
+ Activity Buffer Request 33.30% 1.480ms 33.30% 1.480ms 1.480ms 898.016us 32.19% 898.016us 898.016us 1
3942
+ aten::empty 1.01% 44.942us 1.01% 44.942us 7.490us 0.000us 0.00% 0.000us 0.000us 6
3943
+ cudaFuncSetAttribute 0.31% 13.870us 0.31% 13.870us 4.623us 0.000us 0.00% 0.000us 0.000us 3
3944
+ cudaLaunchKernel 1.01% 44.741us 1.01% 44.741us 14.914us 0.000us 0.00% 0.000us 0.000us 3
3945
+ cudaDeviceSynchronize 56.24% 2.499ms 56.24% 2.499ms 2.499ms 0.000us 0.00% 0.000us 0.000us 1
3946
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3947
+ Self CPU time total: 4.443ms
3948
+ Self CUDA time total: 2.790ms
3949
 
3950
 
3951
 
 
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3957
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3958
+ hf_kernels_flash_attn3 2.31% 100.671us 40.75% 1.773ms 1.773ms 0.000us 0.00% 3.735ms 3.735ms 1
3959
+ FlashAttnFunc 2.09% 91.144us 38.44% 1.673ms 557.547us 0.000us 0.00% 3.735ms 1.245ms 3
3960
+ _flash_attn3_48fe103_dirty::fwd 1.16% 50.371us 36.34% 1.581ms 527.165us 2.796ms 100.00% 3.735ms 1.245ms 3
3961
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.798ms 100.06% 2.798ms 2.798ms 1
3962
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.796ms 100.00% 2.796ms 932.000us 3
3963
+ Activity Buffer Request 33.75% 1.469ms 33.75% 1.469ms 1.469ms 939.487us 33.60% 939.487us 939.487us 1
3964
+ aten::empty 0.64% 27.720us 0.64% 27.720us 4.620us 0.000us 0.00% 0.000us 0.000us 6
3965
+ cudaFuncSetAttribute 0.11% 4.991us 0.11% 4.991us 1.664us 0.000us 0.00% 0.000us 0.000us 3
3966
+ cudaLaunchKernel 0.68% 29.510us 0.68% 29.510us 9.837us 0.000us 0.00% 0.000us 0.000us 3
3967
+ cudaDeviceSynchronize 59.25% 2.578ms 59.25% 2.578ms 2.578ms 0.000us 0.00% 0.000us 0.000us 1
3968
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3969
+ Self CPU time total: 4.352ms
3970
+ Self CUDA time total: 2.796ms
3971
 
3972
 
3973
 
 
3977
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3978
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3979
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3980
+ hf_kernels_flash_attn3 2.10% 95.451us 39.98% 1.817ms 1.817ms 0.000us 0.00% 3.967ms 3.967ms 1
3981
+ FlashAttnFunc 2.52% 114.605us 37.88% 1.721ms 573.824us 0.000us 0.00% 3.967ms 1.322ms 3
3982
+ _flash_attn3_48fe103_dirty::fwd 1.12% 50.981us 35.36% 1.607ms 535.622us 2.964ms 100.00% 3.967ms 1.322ms 3
3983
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.966ms 100.05% 2.966ms 2.966ms 1
3984
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.964ms 100.00% 2.964ms 988.118us 3
3985
+ Activity Buffer Request 32.83% 1.492ms 32.83% 1.492ms 1.492ms 1.002ms 33.81% 1.002ms 1.002ms 1
3986
+ aten::empty 0.60% 27.089us 0.60% 27.089us 4.515us 0.000us 0.00% 0.000us 0.000us 6
3987
+ cudaFuncSetAttribute 0.12% 5.480us 0.12% 5.480us 1.827us 0.000us 0.00% 0.000us 0.000us 3
3988
+ cudaLaunchKernel 0.69% 31.551us 0.69% 31.551us 10.517us 0.000us 0.00% 0.000us 0.000us 3
3989
+ cudaDeviceSynchronize 60.02% 2.727ms 60.02% 2.727ms 2.727ms 0.000us 0.00% 0.000us 0.000us 1
3990
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3991
+ Self CPU time total: 4.544ms
3992
+ Self CUDA time total: 2.964ms
3993
 
3994
 
3995
 
 
3999
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4000
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4001
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4002
+ hf_kernels_flash_attn3 2.35% 113.792us 41.57% 2.016ms 2.016ms 0.000us 0.00% 4.078ms 4.078ms 1
4003
+ FlashAttnFunc 1.91% 92.684us 39.22% 1.902ms 634.112us 0.000us 0.00% 4.078ms 1.359ms 3
4004
+ _flash_attn3_48fe103_dirty::fwd 0.98% 47.600us 37.31% 1.810ms 603.217us 3.050ms 100.00% 4.078ms 1.359ms 3
4005
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.052ms 100.05% 3.052ms 3.052ms 1
4006
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.050ms 100.00% 3.050ms 1.017ms 3
4007
+ Activity Buffer Request 30.19% 1.464ms 30.19% 1.464ms 1.464ms 1.028ms 33.70% 1.028ms 1.028ms 1
4008
+ aten::empty 0.58% 28.221us 0.58% 28.221us 4.703us 0.000us 0.00% 0.000us 0.000us 6
4009
+ cudaFuncSetAttribute 0.11% 5.430us 0.11% 5.430us 1.810us 0.000us 0.00% 0.000us 0.000us 3
4010
+ cudaLaunchKernel 5.44% 264.046us 5.44% 264.046us 88.015us 0.000us 0.00% 0.000us 0.000us 3
4011
+ cudaDeviceSynchronize 58.43% 2.834ms 58.43% 2.834ms 2.834ms 0.000us 0.00% 0.000us 0.000us 1
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
+ Self CPU time total: 4.851ms
4014
+ Self CUDA time total: 3.050ms
4015
 
4016
 
4017
 
 
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4023
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4024
+ hf_kernels_flash_attn3 2.29% 116.152us 37.60% 1.908ms 1.908ms 0.000us 0.00% 4.514ms 4.514ms 1
4025
+ FlashAttnFunc 1.78% 90.384us 35.31% 1.792ms 597.414us 0.000us 0.00% 4.514ms 1.505ms 3
4026
+ _flash_attn3_48fe103_dirty::fwd 0.91% 46.231us 33.53% 1.702ms 567.286us 3.379ms 100.00% 4.514ms 1.505ms 3
4027
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.380ms 100.05% 3.380ms 3.380ms 1
4028
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.379ms 100.00% 3.379ms 1.126ms 3
4029
+ Activity Buffer Request 28.41% 1.442ms 28.41% 1.442ms 1.442ms 1.136ms 33.61% 1.136ms 1.136ms 1
4030
+ aten::empty 0.54% 27.250us 0.54% 27.250us 4.542us 0.000us 0.00% 0.000us 0.000us 6
4031
+ cudaFuncSetAttribute 0.10% 5.250us 0.10% 5.250us 1.750us 0.000us 0.00% 0.000us 0.000us 3
4032
+ cudaLaunchKernel 3.57% 181.204us 3.57% 181.204us 60.401us 0.000us 0.00% 0.000us 0.000us 3
4033
+ cudaDeviceSynchronize 62.40% 3.167ms 62.40% 3.167ms 3.167ms 0.000us 0.00% 0.000us 0.000us 1
4034
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4035
+ Self CPU time total: 5.075ms
4036
+ Self CUDA time total: 3.379ms
4037
 
4038
 
4039
 
 
4043
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4044
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4045
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4046
+ hf_kernels_flash_attn3 2.24% 115.243us 39.36% 2.021ms 2.021ms 0.000us 0.00% 4.438ms 4.438ms 1
4047
+ FlashAttnFunc 1.78% 91.262us 37.12% 1.906ms 635.278us 0.000us 0.00% 4.438ms 1.479ms 3
4048
+ _flash_attn3_48fe103_dirty::fwd 0.90% 46.212us 35.34% 1.815ms 604.857us 3.325ms 100.00% 4.438ms 1.479ms 3
4049
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.327ms 100.04% 3.327ms 3.327ms 1
4050
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.325ms 100.00% 3.325ms 1.108ms 3
4051
+ Activity Buffer Request 30.40% 1.561ms 30.40% 1.561ms 1.561ms 1.113ms 33.46% 1.113ms 1.113ms 1
4052
+ aten::empty 0.54% 27.780us 0.54% 27.780us 4.630us 0.000us 0.00% 0.000us 0.000us 6
4053
+ cudaFuncSetAttribute 0.10% 5.330us 0.10% 5.330us 1.777us 0.000us 0.00% 0.000us 0.000us 3
4054
+ cudaLaunchKernel 3.40% 174.454us 3.40% 174.454us 58.151us 0.000us 0.00% 0.000us 0.000us 3
4055
+ cudaDeviceSynchronize 60.64% 3.113ms 60.64% 3.113ms 3.113ms 0.000us 0.00% 0.000us 0.000us 1
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
+ Self CPU time total: 5.134ms
4058
+ Self CUDA time total: 3.325ms
4059
 
4060
 
4061
  impl wl p50(ms) ok
4062
+ hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 1.00 True
4063
+ hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.99 True
4064
+ hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True
4065
+ hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
4066
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.19 True
4067
+ hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.15 True
4068
  </pre></div>
4069
+ <div class="uv-install-logs" id="uv-logs-benchmark">
4070
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4071
+ <div class="uv-logs-content" style="display: none;">
4072
+ Installed 15 packages in 13ms
4073
+ </div>
4074
  </div>
4075
+ <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4076
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.44it/s]
4077
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.88it/s]</div>
4078
  <div class="cell-artifacts">
4079
  <h4>Artifacts:</h4>
4080
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/mem_efficient_attention.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
@@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 3.92s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3924,28 +3932,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
3924
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3925
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3926
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3927
- torch_mem_eff 4.77% 333.269us 32.71% 2.284ms 2.284ms 0.000us 0.00% 5.420ms 5.420ms 1
3928
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.402ms 100.61% 5.402ms 5.402ms 1
3929
- aten::scaled_dot_product_attention 0.44% 30.450us 2.54% 177.435us 59.145us 0.000us 0.00% 4.753ms 1.584ms 3
3930
- aten::_scaled_dot_product_efficient_attention 0.33% 22.722us 2.10% 146.985us 48.995us 0.000us 0.00% 4.753ms 1.584ms 3
3931
- aten::_efficient_attention_forward 0.51% 35.382us 1.42% 99.273us 33.091us 4.753ms 88.51% 4.753ms 1.584ms 3
3932
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.753ms 88.51% 4.753ms 1.584ms 3
3933
- aten::contiguous 0.17% 11.660us 24.51% 1.712ms 190.185us 0.000us 0.00% 667.266us 74.141us 9
3934
- aten::clone 0.46% 31.810us 24.34% 1.700ms 188.889us 0.000us 0.00% 667.266us 74.141us 9
3935
- aten::copy_ 1.01% 70.871us 22.86% 1.597ms 177.404us 616.738us 11.49% 667.266us 74.141us 9
3936
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 616.738us 11.49% 616.738us 68.526us 9
3937
- Activity Buffer Request 20.64% 1.441ms 20.64% 1.441ms 1.441ms 50.528us 0.94% 50.528us 50.528us 1
3938
- aten::transpose 0.91% 63.619us 1.25% 87.011us 3.625us 0.000us 0.00% 0.000us 0.000us 24
3939
- aten::as_strided 0.33% 23.392us 0.33% 23.392us 0.975us 0.000us 0.00% 0.000us 0.000us 24
3940
- aten::empty_like 0.24% 16.972us 1.02% 71.553us 7.950us 0.000us 0.00% 0.000us 0.000us 9
3941
- aten::empty 1.18% 82.691us 1.18% 82.691us 3.938us 0.000us 0.00% 0.000us 0.000us 21
3942
- cudaLaunchKernel 1.55% 108.383us 1.55% 108.383us 9.032us 0.000us 0.00% 0.000us 0.000us 12
3943
- cudaStreamIsCapturing 0.05% 3.260us 0.05% 3.260us 1.087us 0.000us 0.00% 0.000us 0.000us 3
3944
- cudaFuncSetAttribute 0.12% 8.450us 0.12% 8.450us 2.817us 0.000us 0.00% 0.000us 0.000us 3
3945
- cudaDeviceSynchronize 67.29% 4.700ms 67.29% 4.700ms 4.700ms 0.000us 0.00% 0.000us 0.000us 1
3946
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3947
- Self CPU time total: 6.984ms
3948
- Self CUDA time total: 5.369ms
3949
 
3950
 
3951
 
@@ -3955,28 +3963,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3957
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3958
- torch_mem_eff 3.53% 251.015us 29.52% 2.098ms 2.098ms 0.000us 0.00% 5.633ms 5.633ms 1
3959
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.587ms 100.15% 5.587ms 5.587ms 1
3960
- aten::scaled_dot_product_attention 0.25% 17.630us 2.05% 145.594us 48.531us 0.000us 0.00% 4.943ms 1.648ms 3
3961
- aten::_scaled_dot_product_efficient_attention 0.28% 19.810us 1.80% 127.964us 42.655us 0.000us 0.00% 4.943ms 1.648ms 3
3962
- aten::_efficient_attention_forward 0.42% 29.862us 1.18% 83.512us 27.837us 4.943ms 88.61% 4.943ms 1.648ms 3
3963
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.943ms 88.61% 4.943ms 1.648ms 3
3964
- aten::contiguous 0.10% 7.191us 23.30% 1.656ms 184.002us 0.000us 0.00% 689.540us 76.616us 9
3965
- aten::clone 0.33% 23.318us 23.20% 1.649ms 183.203us 0.000us 0.00% 689.540us 76.616us 9
3966
- aten::copy_ 0.92% 65.725us 22.12% 1.572ms 174.717us 635.140us 11.39% 689.540us 76.616us 9
3967
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 635.140us 11.39% 635.140us 70.571us 9
3968
- Activity Buffer Request 20.24% 1.439ms 20.24% 1.439ms 1.439ms 54.400us 0.98% 54.400us 54.400us 1
3969
- aten::transpose 0.71% 50.494us 0.99% 70.123us 2.922us 0.000us 0.00% 0.000us 0.000us 24
3970
- aten::as_strided 0.28% 19.629us 0.28% 19.629us 0.818us 0.000us 0.00% 0.000us 0.000us 24
3971
- aten::empty_like 0.18% 12.608us 0.75% 53.061us 5.896us 0.000us 0.00% 0.000us 0.000us 9
3972
- aten::empty 0.94% 66.903us 0.94% 66.903us 3.186us 0.000us 0.00% 0.000us 0.000us 21
3973
- cudaLaunchKernel 1.25% 89.012us 1.25% 89.012us 7.418us 0.000us 0.00% 0.000us 0.000us 12
3974
- cudaStreamIsCapturing 0.03% 2.220us 0.03% 2.220us 0.740us 0.000us 0.00% 0.000us 0.000us 3
3975
- cudaFuncSetAttribute 0.05% 3.880us 0.05% 3.880us 1.293us 0.000us 0.00% 0.000us 0.000us 3
3976
- cudaDeviceSynchronize 70.48% 5.009ms 70.48% 5.009ms 5.009ms 0.000us 0.00% 0.000us 0.000us 1
3977
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3978
- Self CPU time total: 7.107ms
3979
- Self CUDA time total: 5.578ms
3980
 
3981
 
3982
 
@@ -3986,28 +3994,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
- torch_mem_eff 3.28% 246.598us 28.54% 2.146ms 2.146ms 0.000us 0.00% 6.014ms 6.014ms 1
3990
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.967ms 100.18% 5.967ms 5.967ms 1
3991
- aten::scaled_dot_product_attention 0.24% 18.181us 1.92% 144.583us 48.194us 0.000us 0.00% 5.302ms 1.767ms 3
3992
- aten::_scaled_dot_product_efficient_attention 0.27% 19.980us 1.68% 126.402us 42.134us 0.000us 0.00% 5.302ms 1.767ms 3
3993
- aten::_efficient_attention_forward 0.38% 28.571us 1.10% 82.521us 27.507us 5.302ms 89.01% 5.302ms 1.767ms 3
3994
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.302ms 89.01% 5.302ms 1.767ms 3
3995
- aten::contiguous 0.09% 6.930us 22.70% 1.707ms 189.666us 0.000us 0.00% 712.547us 79.172us 9
3996
- aten::clone 0.30% 22.691us 22.61% 1.700ms 188.896us 0.000us 0.00% 712.547us 79.172us 9
3997
- aten::copy_ 1.08% 81.024us 21.57% 1.622ms 180.228us 654.403us 10.99% 712.547us 79.172us 9
3998
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.403us 10.99% 654.403us 72.711us 9
3999
- Activity Buffer Request 19.57% 1.471ms 19.57% 1.471ms 1.471ms 58.144us 0.98% 58.144us 58.144us 1
4000
- aten::transpose 0.68% 51.431us 0.95% 71.351us 2.973us 0.000us 0.00% 0.000us 0.000us 24
4001
- aten::as_strided 0.26% 19.920us 0.26% 19.920us 0.830us 0.000us 0.00% 0.000us 0.000us 24
4002
- aten::empty_like 0.16% 11.979us 0.74% 55.320us 6.147us 0.000us 0.00% 0.000us 0.000us 9
4003
- aten::empty 0.93% 69.561us 0.93% 69.561us 3.312us 0.000us 0.00% 0.000us 0.000us 21
4004
- cudaLaunchKernel 1.22% 91.652us 1.22% 91.652us 7.638us 0.000us 0.00% 0.000us 0.000us 12
4005
- cudaStreamIsCapturing 0.03% 2.359us 0.03% 2.359us 0.786us 0.000us 0.00% 0.000us 0.000us 3
4006
- cudaFuncSetAttribute 0.05% 3.430us 0.05% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3
4007
- cudaDeviceSynchronize 71.46% 5.373ms 71.46% 5.373ms 5.373ms 0.000us 0.00% 0.000us 0.000us 1
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
- Self CPU time total: 7.519ms
4010
- Self CUDA time total: 5.956ms
4011
 
4012
 
4013
 
@@ -4017,28 +4025,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
- torch_mem_eff 3.21% 251.576us 29.97% 2.347ms 2.347ms 0.000us 0.00% 6.116ms 6.116ms 1
4021
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.068ms 100.14% 6.068ms 6.068ms 1
4022
- aten::scaled_dot_product_attention 0.24% 18.800us 1.87% 146.693us 48.898us 0.000us 0.00% 5.408ms 1.803ms 3
4023
- aten::_scaled_dot_product_efficient_attention 0.25% 19.900us 1.63% 127.893us 42.631us 0.000us 0.00% 5.408ms 1.803ms 3
4024
- aten::_efficient_attention_forward 0.38% 29.372us 1.07% 83.903us 27.968us 5.408ms 89.25% 5.408ms 1.803ms 3
4025
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.408ms 89.25% 5.408ms 1.803ms 3
4026
- aten::contiguous 0.10% 7.511us 24.29% 1.902ms 211.340us 0.000us 0.00% 708.735us 78.748us 9
4027
- aten::clone 0.28% 21.872us 24.19% 1.895ms 210.505us 0.000us 0.00% 708.735us 78.748us 9
4028
- aten::copy_ 0.85% 66.540us 23.20% 1.817ms 201.834us 651.551us 10.75% 708.735us 78.748us 9
4029
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 651.551us 10.75% 651.551us 72.395us 9
4030
- Activity Buffer Request 18.68% 1.462ms 18.68% 1.462ms 1.462ms 57.184us 0.94% 57.184us 57.184us 1
4031
- aten::transpose 0.65% 50.781us 0.90% 70.402us 2.933us 0.000us 0.00% 0.000us 0.000us 24
4032
- aten::as_strided 0.25% 19.621us 0.25% 19.621us 0.818us 0.000us 0.00% 0.000us 0.000us 24
4033
- aten::empty_like 0.15% 11.809us 0.72% 56.170us 6.241us 0.000us 0.00% 0.000us 0.000us 9
4034
- aten::empty 0.90% 70.242us 0.90% 70.242us 3.345us 0.000us 0.00% 0.000us 0.000us 21
4035
- cudaLaunchKernel 3.97% 310.797us 3.97% 310.797us 25.900us 0.000us 0.00% 0.000us 0.000us 12
4036
- cudaStreamIsCapturing 0.03% 2.250us 0.03% 2.250us 0.750us 0.000us 0.00% 0.000us 0.000us 3
4037
- cudaFuncSetAttribute 0.04% 3.220us 0.04% 3.220us 1.073us 0.000us 0.00% 0.000us 0.000us 3
4038
- cudaDeviceSynchronize 70.03% 5.484ms 70.03% 5.484ms 5.484ms 0.000us 0.00% 0.000us 0.000us 1
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
- Self CPU time total: 7.830ms
4041
- Self CUDA time total: 6.059ms
4042
 
4043
 
4044
 
@@ -4048,28 +4056,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4050
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4051
- torch_mem_eff 3.15% 250.575us 28.50% 2.270ms 2.270ms 0.000us 0.00% 6.322ms 6.322ms 1
4052
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.270ms 100.14% 6.270ms 6.270ms 1
4053
- aten::scaled_dot_product_attention 0.22% 17.572us 1.82% 145.084us 48.361us 0.000us 0.00% 5.598ms 1.866ms 3
4054
- aten::_scaled_dot_product_efficient_attention 0.24% 19.250us 1.60% 127.512us 42.504us 0.000us 0.00% 5.598ms 1.866ms 3
4055
- aten::_efficient_attention_forward 0.36% 28.812us 1.05% 83.962us 27.987us 5.598ms 89.40% 5.598ms 1.866ms 3
4056
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.598ms 89.40% 5.598ms 1.866ms 3
4057
- aten::contiguous 0.09% 6.912us 22.94% 1.827ms 203.045us 0.000us 0.00% 724.000us 80.444us 9
4058
- aten::clone 0.28% 21.949us 22.86% 1.820ms 202.277us 0.000us 0.00% 724.000us 80.444us 9
4059
- aten::copy_ 0.82% 65.091us 21.89% 1.744ms 193.745us 664.032us 10.60% 724.000us 80.444us 9
4060
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 664.032us 10.60% 664.032us 73.781us 9
4061
- Activity Buffer Request 18.02% 1.435ms 18.02% 1.435ms 1.435ms 59.968us 0.96% 59.968us 59.968us 1
4062
- aten::transpose 0.64% 50.930us 0.89% 70.859us 2.952us 0.000us 0.00% 0.000us 0.000us 24
4063
- aten::as_strided 0.25% 19.929us 0.25% 19.929us 0.830us 0.000us 0.00% 0.000us 0.000us 24
4064
- aten::empty_like 0.15% 12.022us 0.69% 54.843us 6.094us 0.000us 0.00% 0.000us 0.000us 9
4065
- aten::empty 0.87% 69.430us 0.87% 69.430us 3.306us 0.000us 0.00% 0.000us 0.000us 21
4066
- cudaLaunchKernel 3.34% 266.388us 3.34% 266.388us 22.199us 0.000us 0.00% 0.000us 0.000us 12
4067
- cudaStreamIsCapturing 0.03% 2.320us 0.03% 2.320us 0.773us 0.000us 0.00% 0.000us 0.000us 3
4068
- cudaFuncSetAttribute 0.04% 3.120us 0.04% 3.120us 1.040us 0.000us 0.00% 0.000us 0.000us 3
4069
- cudaDeviceSynchronize 71.50% 5.695ms 71.50% 5.695ms 5.695ms 0.000us 0.00% 0.000us 0.000us 1
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
- Self CPU time total: 7.965ms
4072
- Self CUDA time total: 6.262ms
4073
 
4074
 
4075
 
@@ -4079,36 +4087,36 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4081
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4082
- torch_mem_eff 3.00% 248.403us 26.98% 2.232ms 2.232ms 0.000us 0.00% 6.668ms 6.668ms 1
4083
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.616ms 100.13% 6.616ms 6.616ms 1
4084
- aten::scaled_dot_product_attention 0.21% 17.221us 1.72% 142.654us 47.551us 0.000us 0.00% 5.939ms 1.980ms 3
4085
- aten::_scaled_dot_product_efficient_attention 0.23% 18.779us 1.52% 125.433us 41.811us 0.000us 0.00% 5.939ms 1.980ms 3
4086
- aten::_efficient_attention_forward 0.34% 28.440us 0.99% 81.712us 27.237us 5.939ms 89.88% 5.939ms 1.980ms 3
4087
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.939ms 89.88% 5.939ms 1.980ms 3
4088
- aten::contiguous 0.08% 6.861us 21.66% 1.792ms 199.142us 0.000us 0.00% 729.440us 81.049us 9
4089
- aten::clone 0.26% 21.352us 21.58% 1.785ms 198.379us 0.000us 0.00% 729.440us 81.049us 9
4090
- aten::copy_ 0.83% 69.012us 20.65% 1.709ms 189.858us 668.928us 10.12% 729.440us 81.049us 9
4091
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 668.928us 10.12% 668.928us 74.325us 9
4092
- Activity Buffer Request 17.29% 1.430ms 17.29% 1.430ms 1.430ms 60.512us 0.92% 60.512us 60.512us 1
4093
- aten::transpose 0.63% 51.780us 0.89% 73.784us 3.074us 0.000us 0.00% 0.000us 0.000us 24
4094
- aten::as_strided 0.27% 22.004us 0.27% 22.004us 0.917us 0.000us 0.00% 0.000us 0.000us 24
4095
- aten::empty_like 0.14% 11.870us 0.67% 55.340us 6.149us 0.000us 0.00% 0.000us 0.000us 9
4096
- aten::empty 0.84% 69.312us 0.84% 69.312us 3.301us 0.000us 0.00% 0.000us 0.000us 21
4097
- cudaLaunchKernel 2.79% 231.145us 2.79% 231.145us 19.262us 0.000us 0.00% 0.000us 0.000us 12
4098
- cudaStreamIsCapturing 0.03% 2.280us 0.03% 2.280us 0.760us 0.000us 0.00% 0.000us 0.000us 3
4099
- cudaFuncSetAttribute 0.04% 3.570us 0.04% 3.570us 1.190us 0.000us 0.00% 0.000us 0.000us 3
4100
- cudaDeviceSynchronize 73.02% 6.041ms 73.02% 6.041ms 6.041ms 0.000us 0.00% 0.000us 0.000us 1
4101
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4102
- Self CPU time total: 8.273ms
4103
- Self CUDA time total: 6.608ms
4104
 
4105
 
4106
  impl wl p50(ms) ok
4107
  torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
4108
- torch_mem_eff cuda_attn_L256_bfloat16 1.89 True
4109
- torch_mem_eff cuda_attn_L320_bfloat16 2.00 True
4110
- torch_mem_eff cuda_attn_L384_bfloat16 1.97 True
4111
- torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
4112
  torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
4113
  </pre></div>
4114
  <div class="cell-artifacts">
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
 
3879
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3880
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3881
  </span> |
3882
+ Cell: benchmark | 3.89s
3883
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3884
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3885
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3932
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3933
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3934
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3935
+ torch_mem_eff 5.04% 355.427us 33.26% 2.347ms 2.347ms 0.000us 0.00% 5.443ms 5.443ms 1
3936
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.441ms 100.90% 5.441ms 5.441ms 1
3937
+ aten::scaled_dot_product_attention 0.45% 31.972us 2.63% 185.885us 61.962us 0.000us 0.00% 4.772ms 1.591ms 3
3938
+ aten::_scaled_dot_product_efficient_attention 0.35% 24.621us 2.18% 153.913us 51.304us 0.000us 0.00% 4.772ms 1.591ms 3
3939
+ aten::_efficient_attention_forward 0.53% 37.509us 1.49% 105.321us 35.107us 4.772ms 88.48% 4.772ms 1.591ms 3
3940
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.772ms 88.48% 4.772ms 1.591ms 3
3941
+ aten::contiguous 0.16% 11.612us 24.73% 1.745ms 193.873us 0.000us 0.00% 671.455us 74.606us 9
3942
+ aten::clone 0.45% 31.980us 24.56% 1.733ms 192.583us 0.000us 0.00% 671.455us 74.606us 9
3943
+ aten::copy_ 1.09% 76.971us 23.11% 1.631ms 181.191us 621.119us 11.52% 671.455us 74.606us 9
3944
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 621.119us 11.52% 621.119us 69.013us 9
3945
+ Activity Buffer Request 20.82% 1.469ms 20.82% 1.469ms 1.469ms 50.336us 0.93% 50.336us 50.336us 1
3946
+ aten::transpose 0.89% 62.923us 1.20% 84.503us 3.521us 0.000us 0.00% 0.000us 0.000us 24
3947
+ aten::as_strided 0.31% 21.580us 0.31% 21.580us 0.899us 0.000us 0.00% 0.000us 0.000us 24
3948
+ aten::empty_like 0.23% 16.040us 1.00% 70.551us 7.839us 0.000us 0.00% 0.000us 0.000us 9
3949
+ aten::empty 1.20% 84.702us 1.20% 84.702us 4.033us 0.000us 0.00% 0.000us 0.000us 21
3950
+ cudaLaunchKernel 1.56% 109.883us 1.56% 109.883us 9.157us 0.000us 0.00% 0.000us 0.000us 12
3951
+ cudaStreamIsCapturing 0.04% 3.130us 0.04% 3.130us 1.043us 0.000us 0.00% 0.000us 0.000us 3
3952
+ cudaFuncSetAttribute 0.13% 9.350us 0.13% 9.350us 3.117us 0.000us 0.00% 0.000us 0.000us 3
3953
+ cudaDeviceSynchronize 66.74% 4.709ms 66.74% 4.709ms 4.709ms 0.000us 0.00% 0.000us 0.000us 1
3954
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3955
+ Self CPU time total: 7.056ms
3956
+ Self CUDA time total: 5.393ms
3957
 
3958
 
3959
 
 
3963
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3964
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3965
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3966
+ torch_mem_eff 3.16% 230.972us 28.28% 2.069ms 2.069ms 0.000us 0.00% 5.837ms 5.837ms 1
3967
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.791ms 100.14% 5.791ms 5.791ms 1
3968
+ aten::scaled_dot_product_attention 0.28% 20.721us 1.89% 138.014us 46.005us 0.000us 0.00% 5.147ms 1.716ms 3
3969
+ aten::_scaled_dot_product_efficient_attention 0.25% 18.299us 1.60% 117.293us 39.098us 0.000us 0.00% 5.147ms 1.716ms 3
3970
+ aten::_efficient_attention_forward 0.37% 27.244us 1.07% 78.053us 26.018us 5.147ms 89.00% 5.147ms 1.716ms 3
3971
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.147ms 89.00% 5.147ms 1.716ms 3
3972
+ aten::contiguous 0.10% 7.473us 22.69% 1.660ms 184.464us 0.000us 0.00% 690.528us 76.725us 9
3973
+ aten::clone 0.31% 22.407us 22.59% 1.653ms 183.634us 0.000us 0.00% 690.528us 76.725us 9
3974
+ aten::copy_ 0.90% 65.683us 21.62% 1.582ms 175.735us 636.032us 11.00% 690.528us 76.725us 9
3975
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.032us 11.00% 636.032us 70.670us 9
3976
+ Activity Buffer Request 19.82% 1.450ms 19.82% 1.450ms 1.450ms 54.496us 0.94% 54.496us 54.496us 1
3977
+ aten::transpose 0.62% 45.174us 0.83% 60.723us 2.530us 0.000us 0.00% 0.000us 0.000us 24
3978
+ aten::as_strided 0.21% 15.549us 0.21% 15.549us 0.648us 0.000us 0.00% 0.000us 0.000us 24
3979
+ aten::empty_like 0.16% 11.973us 0.67% 48.683us 5.409us 0.000us 0.00% 0.000us 0.000us 9
3980
+ aten::empty 0.84% 61.270us 0.84% 61.270us 2.918us 0.000us 0.00% 0.000us 0.000us 21
3981
+ cudaLaunchKernel 1.18% 86.180us 1.18% 86.180us 7.182us 0.000us 0.00% 0.000us 0.000us 12
3982
+ cudaStreamIsCapturing 0.03% 2.460us 0.03% 2.460us 0.820us 0.000us 0.00% 0.000us 0.000us 3
3983
+ cudaFuncSetAttribute 0.04% 3.159us 0.04% 3.159us 1.053us 0.000us 0.00% 0.000us 0.000us 3
3984
+ cudaDeviceSynchronize 71.72% 5.248ms 71.72% 5.248ms 5.248ms 0.000us 0.00% 0.000us 0.000us 1
3985
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3986
+ Self CPU time total: 7.317ms
3987
+ Self CUDA time total: 5.783ms
3988
 
3989
 
3990
 
 
3994
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3995
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3996
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3997
+ torch_mem_eff 3.27% 244.917us 27.45% 2.054ms 2.054ms 0.000us 0.00% 6.034ms 6.034ms 1
3998
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.984ms 100.14% 5.984ms 5.984ms 1
3999
+ aten::scaled_dot_product_attention 0.26% 19.270us 1.91% 142.603us 47.534us 0.000us 0.00% 5.315ms 1.772ms 3
4000
+ aten::_scaled_dot_product_efficient_attention 0.25% 18.622us 1.65% 123.333us 41.111us 0.000us 0.00% 5.315ms 1.772ms 3
4001
+ aten::_efficient_attention_forward 0.37% 27.710us 1.08% 80.560us 26.853us 5.315ms 88.95% 5.315ms 1.772ms 3
4002
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.315ms 88.95% 5.315ms 1.772ms 3
4003
+ aten::contiguous 0.10% 7.220us 21.76% 1.628ms 180.911us 0.000us 0.00% 718.878us 79.875us 9
4004
+ aten::clone 0.29% 21.638us 21.66% 1.621ms 180.109us 0.000us 0.00% 718.878us 79.875us 9
4005
+ aten::copy_ 0.91% 68.381us 20.73% 1.551ms 172.378us 660.254us 11.05% 718.878us 79.875us 9
4006
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 660.254us 11.05% 660.254us 73.362us 9
4007
+ Activity Buffer Request 18.95% 1.418ms 18.95% 1.418ms 1.418ms 58.624us 0.98% 58.624us 58.624us 1
4008
+ aten::transpose 0.63% 46.916us 0.84% 62.771us 2.615us 0.000us 0.00% 0.000us 0.000us 24
4009
+ aten::as_strided 0.21% 15.855us 0.21% 15.855us 0.661us 0.000us 0.00% 0.000us 0.000us 24
4010
+ aten::empty_like 0.15% 11.482us 0.64% 47.942us 5.327us 0.000us 0.00% 0.000us 0.000us 9
4011
+ aten::empty 0.82% 61.110us 0.82% 61.110us 2.910us 0.000us 0.00% 0.000us 0.000us 21
4012
+ cudaLaunchKernel 1.17% 87.854us 1.17% 87.854us 7.321us 0.000us 0.00% 0.000us 0.000us 12
4013
+ cudaStreamIsCapturing 0.03% 2.410us 0.03% 2.410us 0.803us 0.000us 0.00% 0.000us 0.000us 3
4014
+ cudaFuncSetAttribute 0.04% 2.950us 0.04% 2.950us 0.983us 0.000us 0.00% 0.000us 0.000us 3
4015
+ cudaDeviceSynchronize 72.55% 5.429ms 72.55% 5.429ms 5.429ms 0.000us 0.00% 0.000us 0.000us 1
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
+ Self CPU time total: 7.483ms
4018
+ Self CUDA time total: 5.976ms
4019
 
4020
 
4021
 
 
4025
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4026
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
+ torch_mem_eff 3.13% 245.154us 29.09% 2.280ms 2.280ms 0.000us 0.00% 6.166ms 6.166ms 1
4029
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.117ms 100.15% 6.117ms 6.117ms 1
4030
+ aten::scaled_dot_product_attention 0.24% 18.991us 1.80% 140.753us 46.918us 0.000us 0.00% 5.454ms 1.818ms 3
4031
+ aten::_scaled_dot_product_efficient_attention 0.25% 19.741us 1.55% 121.762us 40.587us 0.000us 0.00% 5.454ms 1.818ms 3
4032
+ aten::_efficient_attention_forward 0.36% 27.980us 1.01% 79.030us 26.343us 5.454ms 89.29% 5.454ms 1.818ms 3
4033
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.454ms 89.29% 5.454ms 1.818ms 3
4034
+ aten::contiguous 0.10% 7.853us 23.65% 1.854ms 206.016us 0.000us 0.00% 711.999us 79.111us 9
4035
+ aten::clone 0.28% 21.760us 23.55% 1.846ms 205.144us 0.000us 0.00% 711.999us 79.111us 9
4036
+ aten::copy_ 0.86% 67.621us 22.63% 1.774ms 197.124us 654.399us 10.71% 711.999us 79.111us 9
4037
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.399us 10.71% 654.399us 72.711us 9
4038
+ Activity Buffer Request 18.63% 1.461ms 18.63% 1.461ms 1.461ms 57.600us 0.94% 57.600us 57.600us 1
4039
+ aten::transpose 0.60% 47.388us 0.81% 63.381us 2.641us 0.000us 0.00% 0.000us 0.000us 24
4040
+ aten::as_strided 0.20% 15.993us 0.20% 15.993us 0.666us 0.000us 0.00% 0.000us 0.000us 24
4041
+ aten::empty_like 0.15% 12.039us 0.64% 50.420us 5.602us 0.000us 0.00% 0.000us 0.000us 9
4042
+ aten::empty 0.81% 63.411us 0.81% 63.411us 3.020us 0.000us 0.00% 0.000us 0.000us 21
4043
+ cudaLaunchKernel 3.40% 266.437us 3.40% 266.437us 22.203us 0.000us 0.00% 0.000us 0.000us 12
4044
+ cudaStreamIsCapturing 0.03% 2.470us 0.03% 2.470us 0.823us 0.000us 0.00% 0.000us 0.000us 3
4045
+ cudaFuncSetAttribute 0.04% 3.000us 0.04% 3.000us 1.000us 0.000us 0.00% 0.000us 0.000us 3
4046
+ cudaDeviceSynchronize 70.91% 5.560ms 70.91% 5.560ms 5.560ms 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
+ Self CPU time total: 7.840ms
4049
+ Self CUDA time total: 6.108ms
4050
 
4051
 
4052
 
 
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
+ torch_mem_eff 3.12% 251.727us 28.35% 2.287ms 2.287ms 0.000us 0.00% 6.402ms 6.402ms 1
4060
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.350ms 100.14% 6.350ms 6.350ms 1
4061
+ aten::scaled_dot_product_attention 0.24% 19.272us 1.78% 143.434us 47.811us 0.000us 0.00% 5.676ms 1.892ms 3
4062
+ aten::_scaled_dot_product_efficient_attention 0.24% 19.071us 1.54% 124.162us 41.387us 0.000us 0.00% 5.676ms 1.892ms 3
4063
+ aten::_efficient_attention_forward 0.36% 28.918us 1.02% 82.141us 27.380us 5.676ms 89.51% 5.676ms 1.892ms 3
4064
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.676ms 89.51% 5.676ms 1.892ms 3
4065
+ aten::contiguous 0.09% 7.578us 22.96% 1.852ms 205.774us 0.000us 0.00% 725.410us 80.601us 9
4066
+ aten::clone 0.27% 22.113us 22.87% 1.844ms 204.932us 0.000us 0.00% 725.410us 80.601us 9
4067
+ aten::copy_ 0.85% 68.201us 21.96% 1.771ms 196.780us 665.282us 10.49% 725.410us 80.601us 9
4068
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 665.282us 10.49% 665.282us 73.920us 9
4069
+ Activity Buffer Request 18.11% 1.461ms 18.11% 1.461ms 1.461ms 60.128us 0.95% 60.128us 60.128us 1
4070
+ aten::transpose 0.57% 46.288us 0.78% 62.529us 2.605us 0.000us 0.00% 0.000us 0.000us 24
4071
+ aten::as_strided 0.20% 16.241us 0.20% 16.241us 0.677us 0.000us 0.00% 0.000us 0.000us 24
4072
+ aten::empty_like 0.15% 12.469us 0.64% 51.250us 5.694us 0.000us 0.00% 0.000us 0.000us 9
4073
+ aten::empty 0.80% 64.494us 0.80% 64.494us 3.071us 0.000us 0.00% 0.000us 0.000us 21
4074
+ cudaLaunchKernel 3.27% 263.876us 3.27% 263.876us 21.990us 0.000us 0.00% 0.000us 0.000us 12
4075
+ cudaStreamIsCapturing 0.03% 2.440us 0.03% 2.440us 0.813us 0.000us 0.00% 0.000us 0.000us 3
4076
+ cudaFuncSetAttribute 0.04% 3.380us 0.04% 3.380us 1.127us 0.000us 0.00% 0.000us 0.000us 3
4077
+ cudaDeviceSynchronize 71.65% 5.779ms 71.65% 5.779ms 5.779ms 0.000us 0.00% 0.000us 0.000us 1
4078
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4079
+ Self CPU time total: 8.066ms
4080
+ Self CUDA time total: 6.342ms
4081
 
4082
 
4083
 
 
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4089
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4090
+ torch_mem_eff 2.86% 239.115us 26.99% 2.259ms 2.259ms 0.000us 0.00% 6.718ms 6.718ms 1
4091
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.665ms 100.13% 6.665ms 6.665ms 1
4092
+ aten::scaled_dot_product_attention 0.23% 19.210us 1.67% 139.873us 46.624us 0.000us 0.00% 5.983ms 1.994ms 3
4093
+ aten::_scaled_dot_product_efficient_attention 0.22% 18.712us 1.44% 120.663us 40.221us 0.000us 0.00% 5.983ms 1.994ms 3
4094
+ aten::_efficient_attention_forward 0.33% 27.381us 0.94% 78.541us 26.180us 5.983ms 89.89% 5.983ms 1.994ms 3
4095
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.983ms 89.89% 5.983ms 1.994ms 3
4096
+ aten::contiguous 0.09% 7.469us 21.99% 1.841ms 204.601us 0.000us 0.00% 734.336us 81.593us 9
4097
+ aten::clone 0.27% 22.450us 21.90% 1.834ms 203.772us 0.000us 0.00% 734.336us 81.593us 9
4098
+ aten::copy_ 0.80% 67.050us 21.01% 1.759ms 195.442us 673.088us 10.11% 734.336us 81.593us 9
4099
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 673.088us 10.11% 673.088us 74.788us 9
4100
+ Activity Buffer Request 17.30% 1.449ms 17.30% 1.449ms 1.449ms 61.248us 0.92% 61.248us 61.248us 1
4101
+ aten::transpose 0.55% 46.102us 0.74% 62.332us 2.597us 0.000us 0.00% 0.000us 0.000us 24
4102
+ aten::as_strided 0.19% 16.230us 0.19% 16.230us 0.676us 0.000us 0.00% 0.000us 0.000us 24
4103
+ aten::empty_like 0.14% 11.891us 0.63% 52.512us 5.835us 0.000us 0.00% 0.000us 0.000us 9
4104
+ aten::empty 0.78% 65.061us 0.78% 65.061us 3.098us 0.000us 0.00% 0.000us 0.000us 21
4105
+ cudaLaunchKernel 3.16% 264.678us 3.16% 264.678us 22.056us 0.000us 0.00% 0.000us 0.000us 12
4106
+ cudaStreamIsCapturing 0.03% 2.400us 0.03% 2.400us 0.800us 0.000us 0.00% 0.000us 0.000us 3
4107
+ cudaFuncSetAttribute 0.04% 2.990us 0.04% 2.990us 0.997us 0.000us 0.00% 0.000us 0.000us 3
4108
+ cudaDeviceSynchronize 73.01% 6.113ms 73.01% 6.113ms 6.113ms 0.000us 0.00% 0.000us 0.000us 1
4109
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4110
+ Self CPU time total: 8.372ms
4111
+ Self CUDA time total: 6.656ms
4112
 
4113
 
4114
  impl wl p50(ms) ok
4115
  torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
4116
+ torch_mem_eff cuda_attn_L256_bfloat16 1.94 True
4117
+ torch_mem_eff cuda_attn_L320_bfloat16 1.95 True
4118
+ torch_mem_eff cuda_attn_L384_bfloat16 2.05 True
4119
+ torch_mem_eff cuda_attn_L448_bfloat16 2.07 True
4120
  torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
4121
  </pre></div>
4122
  <div class="cell-artifacts">
flash_attn/impls/sage_attention.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
@@ -3869,9 +3877,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 4.53s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3920,28 +3928,23 @@ Cell: benchmark | 4.53s
3920
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
3921
  impl wl p50(ms) ok
3922
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
3923
- Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
3924
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
3925
- Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
3926
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
3927
- Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
3928
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
3929
- Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
3930
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
3931
- Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
3932
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
3933
- Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
3934
  </pre></div>
3935
- <div class="uv-install-logs" id="uv-logs-benchmark">
3936
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3937
- <div class="uv-logs-content" style="display: none;">
3938
- Installed 15 packages in 14ms
3939
- </div>
3940
  </div>
3941
- <div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3942
- Fetching 11 files: 18%|█▊ | 2/11 [00:00&lt;00:00, 15.79it/s]
3943
- Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 13.55it/s]
3944
- Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 18.83it/s]</div>
3945
  <div class="cell-artifacts">
3946
  <h4>Artifacts:</h4>
3947
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
 
3877
  <span class="collapse-indicators">
3878
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3879
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3880
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3881
  </span> |
3882
+ Cell: benchmark | 4.19s
3883
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3884
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3885
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3928
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
3929
  impl wl p50(ms) ok
3930
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
3931
+ Error: module &#x27;sage_attention_d4f4a6803f593c0b&#x27; has no attribute &#x27;fwd&#x27;
3932
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
3933
+ Error: module &#x27;sage_attention_d4f4a6803f593c0b&#x27; has no attribute &#x27;fwd&#x27;
3934
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
3935
+ Error: module &#x27;sage_attention_d4f4a6803f593c0b&#x27; has no attribute &#x27;fwd&#x27;
3936
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
3937
+ Error: module &#x27;sage_attention_d4f4a6803f593c0b&#x27; has no attribute &#x27;fwd&#x27;
3938
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
3939
+ Error: module &#x27;sage_attention_d4f4a6803f593c0b&#x27; has no attribute &#x27;fwd&#x27;
3940
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
3941
+ Error: module &#x27;sage_attention_d4f4a6803f593c0b&#x27; has no attribute &#x27;fwd&#x27;
3942
  </pre></div>
3943
+ <div class="cell-stderr">
3944
+ Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3945
+ Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 13.96it/s]
3946
+ Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 19.18it/s]
 
3947
  </div>
 
 
 
 
3948
  <div class="cell-artifacts">
3949
  <h4>Artifacts:</h4>
3950
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/xformers.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
@@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 5.02s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3923,21 +3931,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
3923
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3924
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3925
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3926
- xformers_meff 10.85% 481.112us 51.55% 2.285ms 2.285ms 0.000us 0.00% 3.582ms 3.582ms 1
3927
- xformers_flash3::flash_fwd 4.56% 202.185us 39.85% 1.766ms 588.715us 0.000us 0.00% 3.582ms 1.194ms 3
3928
- flash_attn_3::fwd 1.68% 74.662us 35.29% 1.564ms 521.320us 2.681ms 100.00% 3.582ms 1.194ms 3
3929
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.682ms 100.06% 2.682ms 2.682ms 1
3930
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.681ms 100.00% 2.681ms 893.515us 3
3931
- Activity Buffer Request 31.74% 1.407ms 31.74% 1.407ms 1.407ms 901.761us 33.64% 901.761us 901.761us 1
3932
- aten::empty 0.77% 33.920us 0.77% 33.920us 5.653us 0.000us 0.00% 0.000us 0.000us 6
3933
- cudaFuncSetAttribute 0.23% 10.152us 0.23% 10.152us 3.384us 0.000us 0.00% 0.000us 0.000us 3
3934
- cudaLaunchKernel 0.87% 38.521us 0.87% 38.521us 12.840us 0.000us 0.00% 0.000us 0.000us 3
3935
- aten::reshape 0.29% 13.028us 0.85% 37.710us 6.285us 0.000us 0.00% 0.000us 0.000us 6
3936
- aten::view 0.56% 24.682us 0.56% 24.682us 4.114us 0.000us 0.00% 0.000us 0.000us 6
3937
- cudaDeviceSynchronize 48.45% 2.147ms 48.45% 2.147ms 2.147ms 0.000us 0.00% 0.000us 0.000us 1
3938
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3939
- Self CPU time total: 4.432ms
3940
- Self CUDA time total: 2.681ms
3941
 
3942
 
3943
 
@@ -3947,21 +3955,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
3947
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3948
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3949
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3950
- xformers_meff 7.16% 317.438us 45.96% 2.036ms 2.036ms 0.000us 0.00% 3.779ms 3.779ms 1
3951
- xformers_flash3::flash_fwd 3.35% 148.243us 38.25% 1.695ms 564.991us 0.000us 0.00% 3.779ms 1.260ms 3
3952
- flash_attn_3::fwd 1.25% 55.403us 34.91% 1.547ms 515.576us 2.825ms 100.00% 3.779ms 1.260ms 3
3953
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.827ms 100.05% 2.827ms 2.827ms 1
3954
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.825ms 100.00% 2.825ms 941.739us 3
3955
- Activity Buffer Request 32.14% 1.424ms 32.14% 1.424ms 1.424ms 954.080us 33.77% 954.080us 954.080us 1
3956
- aten::empty 0.63% 27.720us 0.63% 27.720us 4.620us 0.000us 0.00% 0.000us 0.000us 6
3957
- cudaFuncSetAttribute 0.12% 5.400us 0.12% 5.400us 1.800us 0.000us 0.00% 0.000us 0.000us 3
3958
- cudaLaunchKernel 0.77% 34.161us 0.77% 34.161us 11.387us 0.000us 0.00% 0.000us 0.000us 3
3959
- aten::reshape 0.21% 9.370us 0.54% 23.750us 3.958us 0.000us 0.00% 0.000us 0.000us 6
3960
- aten::view 0.32% 14.380us 0.32% 14.380us 2.397us 0.000us 0.00% 0.000us 0.000us 6
3961
- cudaDeviceSynchronize 54.04% 2.395ms 54.04% 2.395ms 2.395ms 0.000us 0.00% 0.000us 0.000us 1
3962
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3963
- Self CPU time total: 4.431ms
3964
- Self CUDA time total: 2.825ms
3965
 
3966
 
3967
 
@@ -3971,21 +3979,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
3971
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3972
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3973
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3974
- xformers_meff 6.87% 310.027us 44.72% 2.018ms 2.018ms 0.000us 0.00% 3.923ms 3.923ms 1
3975
- xformers_flash3::flash_fwd 3.22% 145.444us 37.33% 1.684ms 561.324us 0.000us 0.00% 3.923ms 1.308ms 3
3976
- flash_attn_3::fwd 1.15% 52.002us 34.10% 1.539ms 512.843us 2.919ms 100.00% 3.923ms 1.308ms 3
3977
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.921ms 100.06% 2.921ms 2.921ms 1
3978
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.919ms 100.00% 2.919ms 973.037us 3
3979
- Activity Buffer Request 31.44% 1.418ms 31.44% 1.418ms 1.418ms 1.004ms 34.40% 1.004ms 1.004ms 1
3980
- aten::empty 0.63% 28.392us 0.63% 28.392us 4.732us 0.000us 0.00% 0.000us 0.000us 6
3981
- cudaFuncSetAttribute 0.12% 5.520us 0.12% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3
3982
- cudaLaunchKernel 0.76% 34.420us 0.76% 34.420us 11.473us 0.000us 0.00% 0.000us 0.000us 3
3983
- aten::reshape 0.21% 9.519us 0.52% 23.650us 3.942us 0.000us 0.00% 0.000us 0.000us 6
3984
- aten::view 0.31% 14.131us 0.31% 14.131us 2.355us 0.000us 0.00% 0.000us 0.000us 6
3985
- cudaDeviceSynchronize 55.28% 2.494ms 55.28% 2.494ms 2.494ms 0.000us 0.00% 0.000us 0.000us 1
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
- Self CPU time total: 4.511ms
3988
- Self CUDA time total: 2.919ms
3989
 
3990
 
3991
 
@@ -3995,21 +4003,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3997
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3998
- xformers_meff 6.73% 317.798us 47.46% 2.241ms 2.241ms 0.000us 0.00% 3.892ms 3.892ms 1
3999
- xformers_flash3::flash_fwd 3.10% 146.544us 40.23% 1.900ms 633.169us 0.000us 0.00% 3.892ms 1.297ms 3
4000
- flash_attn_3::fwd 1.15% 54.462us 37.13% 1.753ms 584.321us 2.910ms 100.00% 3.892ms 1.297ms 3
4001
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.911ms 100.05% 2.911ms 2.911ms 1
4002
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.910ms 100.00% 2.910ms 969.848us 3
4003
- Activity Buffer Request 30.01% 1.417ms 30.01% 1.417ms 1.417ms 982.915us 33.78% 982.915us 982.915us 1
4004
- aten::empty 0.62% 29.170us 0.62% 29.170us 4.862us 0.000us 0.00% 0.000us 0.000us 6
4005
- cudaFuncSetAttribute 0.11% 5.370us 0.11% 5.370us 1.790us 0.000us 0.00% 0.000us 0.000us 3
4006
- cudaLaunchKernel 5.23% 247.156us 5.23% 247.156us 82.385us 0.000us 0.00% 0.000us 0.000us 3
4007
- aten::reshape 0.20% 9.560us 0.50% 23.460us 3.910us 0.000us 0.00% 0.000us 0.000us 6
4008
- aten::view 0.29% 13.900us 0.29% 13.900us 2.317us 0.000us 0.00% 0.000us 0.000us 6
4009
- cudaDeviceSynchronize 52.54% 2.481ms 52.54% 2.481ms 2.481ms 0.000us 0.00% 0.000us 0.000us 1
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
- Self CPU time total: 4.721ms
4012
- Self CUDA time total: 2.910ms
4013
 
4014
 
4015
 
@@ -4019,21 +4027,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
- xformers_meff 5.86% 306.369us 41.94% 2.193ms 2.193ms 0.000us 0.00% 4.614ms 4.614ms 1
4023
- xformers_flash3::flash_fwd 2.85% 149.202us 35.63% 1.863ms 620.885us 0.000us 0.00% 4.614ms 1.538ms 3
4024
- flash_attn_3::fwd 1.03% 53.951us 32.77% 1.713ms 571.151us 3.461ms 100.00% 4.614ms 1.538ms 3
4025
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.462ms 100.04% 3.462ms 3.462ms 1
4026
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.461ms 100.00% 3.461ms 1.154ms 3
4027
- Activity Buffer Request 27.28% 1.426ms 27.28% 1.426ms 1.426ms 1.153ms 33.31% 1.153ms 1.153ms 1
4028
- aten::empty 0.55% 28.813us 0.55% 28.813us 4.802us 0.000us 0.00% 0.000us 0.000us 6
4029
- cudaFuncSetAttribute 0.11% 5.560us 0.11% 5.560us 1.853us 0.000us 0.00% 0.000us 0.000us 3
4030
- cudaLaunchKernel 3.80% 198.684us 3.80% 198.684us 66.228us 0.000us 0.00% 0.000us 0.000us 3
4031
- aten::reshape 0.18% 9.430us 0.46% 23.930us 3.988us 0.000us 0.00% 0.000us 0.000us 6
4032
- aten::view 0.28% 14.500us 0.28% 14.500us 2.417us 0.000us 0.00% 0.000us 0.000us 6
4033
- cudaDeviceSynchronize 58.06% 3.036ms 58.06% 3.036ms 3.036ms 0.000us 0.00% 0.000us 0.000us 1
4034
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4035
- Self CPU time total: 5.228ms
4036
- Self CUDA time total: 3.461ms
4037
 
4038
 
4039
 
@@ -4043,37 +4051,83 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
4043
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4044
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4045
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4046
- xformers_meff 5.96% 310.158us 41.66% 2.167ms 2.167ms 0.000us 0.00% 4.643ms 4.643ms 1
4047
- xformers_flash3::flash_fwd 2.83% 146.954us 35.22% 1.832ms 610.728us 0.000us 0.00% 4.643ms 1.548ms 3
4048
- flash_attn_3::fwd 1.00% 51.911us 32.40% 1.685ms 561.744us 3.464ms 100.00% 4.643ms 1.548ms 3
4049
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.465ms 100.04% 3.465ms 3.465ms 1
4050
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.464ms 100.00% 3.464ms 1.155ms 3
4051
- Activity Buffer Request 27.49% 1.430ms 27.49% 1.430ms 1.430ms 1.179ms 34.05% 1.179ms 1.179ms 1
4052
- aten::empty 0.54% 28.311us 0.54% 28.311us 4.719us 0.000us 0.00% 0.000us 0.000us 6
4053
- cudaFuncSetAttribute 0.11% 5.750us 0.11% 5.750us 1.917us 0.000us 0.00% 0.000us 0.000us 3
4054
- cudaLaunchKernel 3.25% 169.084us 3.25% 169.084us 56.361us 0.000us 0.00% 0.000us 0.000us 3
4055
- aten::reshape 0.17% 8.670us 0.48% 24.720us 4.120us 0.000us 0.00% 0.000us 0.000us 6
4056
- aten::view 0.31% 16.050us 0.31% 16.050us 2.675us 0.000us 0.00% 0.000us 0.000us 6
4057
- cudaDeviceSynchronize 58.34% 3.035ms 58.34% 3.035ms 3.035ms 0.000us 0.00% 0.000us 0.000us 1
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
- Self CPU time total: 5.202ms
4060
- Self CUDA time total: 3.464ms
4061
 
4062
 
4063
  impl wl p50(ms) ok
4064
- xformers_meff cuda_attn_L128_bfloat16 1.00 True
4065
- xformers_meff cuda_attn_L256_bfloat16 1.03 True
4066
- xformers_meff cuda_attn_L320_bfloat16 1.08 True
4067
- xformers_meff cuda_attn_L384_bfloat16 1.09 True
4068
- xformers_meff cuda_attn_L448_bfloat16 1.25 True
4069
- xformers_meff cuda_attn_L512_bfloat16 1.24 True
4070
  </pre></div>
4071
  <div class="uv-install-logs" id="uv-logs-benchmark">
4072
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4073
  <div class="uv-logs-content" style="display: none;">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4074
  Downloading xformers (111.8MiB)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4075
  Downloading xformers
4076
- Installed 1 package in 13ms
 
 
 
 
 
 
 
 
 
4077
  </div>
4078
  </div>
4079
  <div class="cell-artifacts">
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
 
3879
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3880
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3881
  </span> |
3882
+ Cell: benchmark | 33.44s
3883
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3884
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3885
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3931
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3932
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3933
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3934
+ xformers_meff 9.89% 457.200us 48.78% 2.255ms 2.255ms 0.000us 0.00% 3.820ms 3.820ms 1
3935
+ xformers_flash3::flash_fwd 3.84% 177.424us 38.10% 1.761ms 587.077us 0.000us 0.00% 3.820ms 1.273ms 3
3936
+ flash_attn_3::fwd 1.55% 71.862us 34.26% 1.584ms 527.935us 2.885ms 100.00% 3.820ms 1.273ms 3
3937
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.886ms 100.04% 2.886ms 2.886ms 1
3938
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.885ms 100.00% 2.885ms 961.658us 3
3939
+ Activity Buffer Request 30.73% 1.420ms 30.73% 1.420ms 1.420ms 934.553us 32.39% 934.553us 934.553us 1
3940
+ aten::empty 0.74% 34.201us 0.74% 34.201us 5.700us 0.000us 0.00% 0.000us 0.000us 6
3941
+ cudaFuncSetAttribute 0.22% 10.110us 0.22% 10.110us 3.370us 0.000us 0.00% 0.000us 0.000us 3
3942
+ cudaLaunchKernel 1.02% 47.230us 1.02% 47.230us 15.743us 0.000us 0.00% 0.000us 0.000us 3
3943
+ aten::reshape 0.34% 15.510us 0.79% 36.581us 6.097us 0.000us 0.00% 0.000us 0.000us 6
3944
+ aten::view 0.46% 21.071us 0.46% 21.071us 3.512us 0.000us 0.00% 0.000us 0.000us 6
3945
+ cudaDeviceSynchronize 51.22% 2.368ms 51.22% 2.368ms 2.368ms 0.000us 0.00% 0.000us 0.000us 1
3946
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3947
+ Self CPU time total: 4.623ms
3948
+ Self CUDA time total: 2.885ms
3949
 
3950
 
3951
 
 
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3957
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3958
+ xformers_meff 6.56% 301.335us 45.12% 2.073ms 2.073ms 0.000us 0.00% 3.862ms 3.862ms 1
3959
+ xformers_flash3::flash_fwd 3.02% 138.865us 38.04% 1.748ms 582.607us 0.000us 0.00% 3.862ms 1.287ms 3
3960
+ flash_attn_3::fwd 1.15% 53.013us 35.02% 1.609ms 536.319us 2.932ms 100.00% 3.862ms 1.287ms 3
3961
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.933ms 100.04% 2.933ms 2.933ms 1
3962
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.932ms 100.00% 2.932ms 977.308us 3
3963
+ Activity Buffer Request 32.36% 1.487ms 32.36% 1.487ms 1.487ms 930.332us 31.73% 930.332us 930.332us 1
3964
+ aten::empty 0.65% 29.679us 0.65% 29.679us 4.946us 0.000us 0.00% 0.000us 0.000us 6
3965
+ cudaFuncSetAttribute 0.12% 5.591us 0.12% 5.591us 1.864us 0.000us 0.00% 0.000us 0.000us 3
3966
+ cudaLaunchKernel 0.74% 34.170us 0.74% 34.170us 11.390us 0.000us 0.00% 0.000us 0.000us 3
3967
+ aten::reshape 0.22% 9.881us 0.51% 23.631us 3.938us 0.000us 0.00% 0.000us 0.000us 6
3968
+ aten::view 0.30% 13.750us 0.30% 13.750us 2.292us 0.000us 0.00% 0.000us 0.000us 6
3969
+ cudaDeviceSynchronize 54.88% 2.521ms 54.88% 2.521ms 2.521ms 0.000us 0.00% 0.000us 0.000us 1
3970
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3971
+ Self CPU time total: 4.594ms
3972
+ Self CUDA time total: 2.932ms
3973
 
3974
 
3975
 
 
3979
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3980
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3981
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3982
+ xformers_meff 6.47% 295.057us 44.36% 2.024ms 2.024ms 0.000us 0.00% 3.906ms 3.906ms 1
3983
+ xformers_flash3::flash_fwd 3.08% 140.693us 37.39% 1.706ms 568.676us 0.000us 0.00% 3.906ms 1.302ms 3
3984
+ flash_attn_3::fwd 1.15% 52.641us 34.31% 1.565ms 521.779us 2.948ms 100.00% 3.906ms 1.302ms 3
3985
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.949ms 100.05% 2.949ms 2.949ms 1
3986
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.948ms 100.00% 2.948ms 982.658us 3
3987
+ Activity Buffer Request 31.65% 1.444ms 31.65% 1.444ms 1.444ms 958.263us 32.51% 958.263us 958.263us 1
3988
+ aten::empty 0.65% 29.440us 0.65% 29.440us 4.907us 0.000us 0.00% 0.000us 0.000us 6
3989
+ cudaFuncSetAttribute 0.12% 5.511us 0.12% 5.511us 1.837us 0.000us 0.00% 0.000us 0.000us 3
3990
+ cudaLaunchKernel 0.74% 33.911us 0.74% 33.911us 11.304us 0.000us 0.00% 0.000us 0.000us 3
3991
+ aten::reshape 0.18% 8.109us 0.50% 22.850us 3.808us 0.000us 0.00% 0.000us 0.000us 6
3992
+ aten::view 0.32% 14.741us 0.32% 14.741us 2.457us 0.000us 0.00% 0.000us 0.000us 6
3993
+ cudaDeviceSynchronize 55.64% 2.539ms 55.64% 2.539ms 2.539ms 0.000us 0.00% 0.000us 0.000us 1
3994
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3995
+ Self CPU time total: 4.562ms
3996
+ Self CUDA time total: 2.948ms
3997
 
3998
 
3999
 
 
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
+ xformers_meff 6.44% 300.857us 47.49% 2.217ms 2.217ms 0.000us 0.00% 3.827ms 3.827ms 1
4007
+ xformers_flash3::flash_fwd 3.16% 147.703us 40.53% 1.892ms 630.694us 0.000us 0.00% 3.827ms 1.276ms 3
4008
+ flash_attn_3::fwd 1.13% 52.820us 37.36% 1.744ms 581.460us 2.874ms 100.00% 3.827ms 1.276ms 3
4009
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.876ms 100.05% 2.876ms 2.876ms 1
4010
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.874ms 100.00% 2.874ms 958.161us 3
4011
+ Activity Buffer Request 30.85% 1.440ms 30.85% 1.440ms 1.440ms 952.124us 33.12% 952.124us 952.124us 1
4012
+ aten::empty 0.63% 29.391us 0.63% 29.391us 4.899us 0.000us 0.00% 0.000us 0.000us 6
4013
+ cudaFuncSetAttribute 0.13% 5.930us 0.13% 5.930us 1.977us 0.000us 0.00% 0.000us 0.000us 3
4014
+ cudaLaunchKernel 4.63% 215.955us 4.63% 215.955us 71.985us 0.000us 0.00% 0.000us 0.000us 3
4015
+ aten::reshape 0.22% 10.380us 0.51% 23.940us 3.990us 0.000us 0.00% 0.000us 0.000us 6
4016
+ aten::view 0.29% 13.560us 0.29% 13.560us 2.260us 0.000us 0.00% 0.000us 0.000us 6
4017
+ cudaDeviceSynchronize 52.51% 2.452ms 52.51% 2.452ms 2.452ms 0.000us 0.00% 0.000us 0.000us 1
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
+ Self CPU time total: 4.669ms
4020
+ Self CUDA time total: 2.874ms
4021
 
4022
 
4023
 
 
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4029
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4030
+ xformers_meff 5.75% 298.955us 42.23% 2.194ms 2.194ms 0.000us 0.00% 4.560ms 4.560ms 1
4031
+ xformers_flash3::flash_fwd 2.73% 142.094us 36.04% 1.872ms 624.074us 0.000us 0.00% 4.560ms 1.520ms 3
4032
+ flash_attn_3::fwd 1.06% 54.881us 33.30% 1.730ms 576.710us 3.413ms 100.00% 4.560ms 1.520ms 3
4033
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.415ms 100.04% 3.415ms 3.415ms 1
4034
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.413ms 100.00% 3.413ms 1.138ms 3
4035
+ Activity Buffer Request 27.56% 1.432ms 27.56% 1.432ms 1.432ms 1.147ms 33.59% 1.147ms 1.147ms 1
4036
+ aten::empty 0.56% 28.860us 0.56% 28.860us 4.810us 0.000us 0.00% 0.000us 0.000us 6
4037
+ cudaFuncSetAttribute 0.10% 5.420us 0.10% 5.420us 1.807us 0.000us 0.00% 0.000us 0.000us 3
4038
+ cudaLaunchKernel 4.02% 208.865us 4.02% 208.865us 69.622us 0.000us 0.00% 0.000us 0.000us 3
4039
+ aten::reshape 0.18% 9.222us 0.44% 22.901us 3.817us 0.000us 0.00% 0.000us 0.000us 6
4040
+ aten::view 0.26% 13.679us 0.26% 13.679us 2.280us 0.000us 0.00% 0.000us 0.000us 6
4041
+ cudaDeviceSynchronize 57.77% 3.001ms 57.77% 3.001ms 3.001ms 0.000us 0.00% 0.000us 0.000us 1
4042
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4043
+ Self CPU time total: 5.196ms
4044
+ Self CUDA time total: 3.413ms
4045
 
4046
 
4047
 
 
4051
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4052
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
+ xformers_meff 5.27% 272.556us 42.19% 2.184ms 2.184ms 0.000us 0.00% 4.536ms 4.536ms 1
4055
+ xformers_flash3::flash_fwd 2.70% 139.942us 36.49% 1.889ms 629.618us 0.000us 0.00% 4.536ms 1.512ms 3
4056
+ flash_attn_3::fwd 1.02% 52.981us 33.79% 1.749ms 582.970us 3.398ms 100.00% 4.536ms 1.512ms 3
4057
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.400ms 100.05% 3.400ms 3.400ms 1
4058
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.398ms 100.00% 3.398ms 1.133ms 3
4059
+ Activity Buffer Request 28.10% 1.454ms 28.10% 1.454ms 1.454ms 1.138ms 33.49% 1.138ms 1.138ms 1
4060
+ aten::empty 0.56% 28.991us 0.56% 28.991us 4.832us 0.000us 0.00% 0.000us 0.000us 6
4061
+ cudaFuncSetAttribute 0.11% 5.511us 0.11% 5.511us 1.837us 0.000us 0.00% 0.000us 0.000us 3
4062
+ cudaLaunchKernel 4.00% 207.225us 4.00% 207.225us 69.075us 0.000us 0.00% 0.000us 0.000us 3
4063
+ aten::reshape 0.17% 8.891us 0.44% 22.532us 3.755us 0.000us 0.00% 0.000us 0.000us 6
4064
+ aten::view 0.26% 13.641us 0.26% 13.641us 2.274us 0.000us 0.00% 0.000us 0.000us 6
4065
+ cudaDeviceSynchronize 57.81% 2.992ms 57.81% 2.992ms 2.992ms 0.000us 0.00% 0.000us 0.000us 1
4066
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4067
+ Self CPU time total: 5.176ms
4068
+ Self CUDA time total: 3.398ms
4069
 
4070
 
4071
  impl wl p50(ms) ok
4072
+ xformers_meff cuda_attn_L128_bfloat16 0.99 True
4073
+ xformers_meff cuda_attn_L256_bfloat16 1.05 True
4074
+ xformers_meff cuda_attn_L320_bfloat16 1.06 True
4075
+ xformers_meff cuda_attn_L384_bfloat16 1.06 True
4076
+ xformers_meff cuda_attn_L448_bfloat16 1.23 True
4077
+ xformers_meff cuda_attn_L512_bfloat16 1.23 True
4078
  </pre></div>
4079
  <div class="uv-install-logs" id="uv-logs-benchmark">
4080
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4081
  <div class="uv-logs-content" style="display: none;">
4082
+ Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4083
+ Downloading setuptools (1.1MiB)
4084
+ Downloading numpy (16.2MiB)
4085
+ Downloading nvidia-cublas-cu12 (566.8MiB)
4086
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4087
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4088
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4089
+ Downloading kiwisolver (1.4MiB)
4090
+ Downloading torch (846.9MiB)
4091
+ Downloading matplotlib (8.3MiB)
4092
+ Downloading sympy (6.0MiB)
4093
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4094
+ Downloading fonttools (4.7MiB)
4095
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4096
+ Downloading pillow (6.7MiB)
4097
+ Downloading networkx (1.9MiB)
4098
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4099
+ Downloading triton (148.3MiB)
4100
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4101
  Downloading xformers (111.8MiB)
4102
+ Downloading nvidia-curand-cu12 (60.7MiB)
4103
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4104
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4105
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4106
+ Downloading nvidia-cufile-cu12
4107
+ Downloading kiwisolver
4108
+ Downloading setuptools
4109
+ Downloading networkx
4110
+ Downloading fonttools
4111
+ Downloading pillow
4112
+ Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4113
+ Downloading nvidia-cuda-cupti-cu12
4114
+ Downloading matplotlib
4115
+ Downloading numpy
4116
+ Downloading sympy
4117
+ Downloading nvidia-nvjitlink-cu12
4118
+ Downloading nvidia-curand-cu12
4119
+ Downloading nvidia-cuda-nvrtc-cu12
4120
  Downloading xformers
4121
+ Downloading triton
4122
+ Downloading nvidia-cufft-cu12
4123
+ Downloading nvidia-cusolver-cu12
4124
+ Downloading nvidia-cusparse-cu12
4125
+ Downloading nvidia-cusparselt-cu12
4126
+ Downloading nvidia-nccl-cu12
4127
+ Downloading nvidia-cudnn-cu12
4128
+ Downloading nvidia-cublas-cu12
4129
+ Downloading torch
4130
+ Installed 38 packages in 204ms
4131
  </div>
4132
  </div>
4133
  <div class="cell-artifacts">
flash_attn/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 0a7d7b3dc8fc6b60a4b9f8bfcf3e229706548b71a8174822b89cc9a2746d3bbd
  • Pointer size: 130 Bytes
  • Size of remote file: 24.8 kB

Git LFS Details

  • SHA256: 567d9c3aecb5f005a8679995284fab5112829f643a670a3a2d3688588b305153
  • Pointer size: 130 Bytes
  • Size of remote file: 24.8 kB
flash_attn/results/combined_results.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
@@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
- <dc:date>2025-10-29T14:28:03.109695</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
@@ -3982,96 +3990,96 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3982
  <g id="matplotlib.axis_2">
3983
  <g id="ytick_1">
3984
  <g id="grid-y--2" class="grid grid-y">
3985
- <path d="M 47.81 409.00723 L 835.361742 409.00723 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3986
  </g>
3987
  <g id="line2d_7">
3988
  <defs>
3989
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3990
  </defs>
3991
  <g>
3992
- <use ns4:href="#m0fca2865ba" x="47.81" y="409.00723" style="stroke: #000000; stroke-width: 0.8" />
3993
  </g>
3994
  </g>
3995
  <g id="text_7">
3996
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="412.806448" transform="rotate(-0 40.81 412.806448)">1.0</text>
3997
  </g>
3998
  </g>
3999
  <g id="ytick_2">
4000
  <g id="grid-y--3" class="grid grid-y">
4001
- <path d="M 47.81 347.973099 L 835.361742 347.973099 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4002
  </g>
4003
  <g id="line2d_8">
4004
  <g>
4005
- <use ns4:href="#m0fca2865ba" x="47.81" y="347.973099" style="stroke: #000000; stroke-width: 0.8" />
4006
  </g>
4007
  </g>
4008
  <g id="text_8">
4009
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="351.772318" transform="rotate(-0 40.81 351.772318)">1.2</text>
4010
  </g>
4011
  </g>
4012
  <g id="ytick_3">
4013
  <g id="grid-y--4" class="grid grid-y">
4014
- <path d="M 47.81 286.938969 L 835.361742 286.938969 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4015
  </g>
4016
  <g id="line2d_9">
4017
  <g>
4018
- <use ns4:href="#m0fca2865ba" x="47.81" y="286.938969" style="stroke: #000000; stroke-width: 0.8" />
4019
  </g>
4020
  </g>
4021
  <g id="text_9">
4022
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="290.738187" transform="rotate(-0 40.81 290.738187)">1.4</text>
4023
  </g>
4024
  </g>
4025
  <g id="ytick_4">
4026
  <g id="grid-y--5" class="grid grid-y">
4027
- <path d="M 47.81 225.904838 L 835.361742 225.904838 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4028
  </g>
4029
  <g id="line2d_10">
4030
  <g>
4031
- <use ns4:href="#m0fca2865ba" x="47.81" y="225.904838" style="stroke: #000000; stroke-width: 0.8" />
4032
  </g>
4033
  </g>
4034
  <g id="text_10">
4035
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="229.704057" transform="rotate(-0 40.81 229.704057)">1.6</text>
4036
  </g>
4037
  </g>
4038
  <g id="ytick_5">
4039
  <g id="grid-y--6" class="grid grid-y">
4040
- <path d="M 47.81 164.870708 L 835.361742 164.870708 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4041
  </g>
4042
  <g id="line2d_11">
4043
  <g>
4044
- <use ns4:href="#m0fca2865ba" x="47.81" y="164.870708" style="stroke: #000000; stroke-width: 0.8" />
4045
  </g>
4046
  </g>
4047
  <g id="text_11">
4048
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="168.669926" transform="rotate(-0 40.81 168.669926)">1.8</text>
4049
  </g>
4050
  </g>
4051
  <g id="ytick_6">
4052
  <g id="grid-y--7" class="grid grid-y">
4053
- <path d="M 47.81 103.836577 L 835.361742 103.836577 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4054
  </g>
4055
  <g id="line2d_12">
4056
  <g>
4057
- <use ns4:href="#m0fca2865ba" x="47.81" y="103.836577" style="stroke: #000000; stroke-width: 0.8" />
4058
  </g>
4059
  </g>
4060
  <g id="text_12">
4061
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="107.635796" transform="rotate(-0 40.81 107.635796)">2.0</text>
4062
  </g>
4063
  </g>
4064
  <g id="ytick_7">
4065
  <g id="grid-y--8" class="grid grid-y">
4066
- <path d="M 47.81 42.802447 L 835.361742 42.802447 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4067
  </g>
4068
  <g id="line2d_13">
4069
  <g>
4070
- <use ns4:href="#m0fca2865ba" x="47.81" y="42.802447" style="stroke: #000000; stroke-width: 0.8" />
4071
  </g>
4072
  </g>
4073
  <g id="text_13">
4074
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="46.601665" transform="rotate(-0 40.81 46.601665)">2.2</text>
4075
  </g>
4076
  </g>
4077
  <g id="label--y" class="ylabel">
@@ -4079,73 +4087,73 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4079
  </g>
4080
  </g>
4081
  <g id="series--torch-flash-ma" class="series">
4082
- <path d="M 83.607806 340.639848 L 226.799032 324.181385 L 369.990258 320.559009 L 513.181484 308.901185 L 656.37271 265.282228 L 799.563935 254.967155 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4083
  <defs>
4084
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4085
  </defs>
4086
  <g clip-path="url(#p09feef2583)">
4087
- <use ns4:href="#md7efaf3aec" x="83.607806" y="340.639848" style="fill: #1f77b4; stroke: #1f77b4" />
4088
- <use ns4:href="#md7efaf3aec" x="226.799032" y="324.181385" style="fill: #1f77b4; stroke: #1f77b4" />
4089
- <use ns4:href="#md7efaf3aec" x="369.990258" y="320.559009" style="fill: #1f77b4; stroke: #1f77b4" />
4090
- <use ns4:href="#md7efaf3aec" x="513.181484" y="308.901185" style="fill: #1f77b4; stroke: #1f77b4" />
4091
- <use ns4:href="#md7efaf3aec" x="656.37271" y="265.282228" style="fill: #1f77b4; stroke: #1f77b4" />
4092
- <use ns4:href="#md7efaf3aec" x="799.563935" y="254.967155" style="fill: #1f77b4; stroke: #1f77b4" />
4093
  </g>
4094
  </g>
4095
  <g id="series--torch-mem-eff" class="series">
4096
- <path d="M 83.607806 156.748591 L 226.799032 137.315018 L 369.990258 105.143013 L 513.181484 114.228248 L 656.37271 86.655469 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4097
  <defs>
4098
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4099
  </defs>
4100
  <g clip-path="url(#p09feef2583)">
4101
- <use ns4:href="#m9b8c54d372" x="83.607806" y="156.748591" style="fill: #ff7f0e; stroke: #ff7f0e" />
4102
- <use ns4:href="#m9b8c54d372" x="226.799032" y="137.315018" style="fill: #ff7f0e; stroke: #ff7f0e" />
4103
- <use ns4:href="#m9b8c54d372" x="369.990258" y="105.143013" style="fill: #ff7f0e; stroke: #ff7f0e" />
4104
- <use ns4:href="#m9b8c54d372" x="513.181484" y="114.228248" style="fill: #ff7f0e; stroke: #ff7f0e" />
4105
- <use ns4:href="#m9b8c54d372" x="656.37271" y="86.655469" style="fill: #ff7f0e; stroke: #ff7f0e" />
4106
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4107
  </g>
4108
  </g>
4109
  <g id="series--xformers-meff" class="series">
4110
- <path d="M 83.607806 410.498293 L 226.799032 399.197519 L 369.990258 383.346345 L 513.181484 381.042612 L 656.37271 332.003214 L 799.563935 335.418073 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4111
  <defs>
4112
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4113
  </defs>
4114
  <g clip-path="url(#p09feef2583)">
4115
- <use ns4:href="#mc655281e0b" x="83.607806" y="410.498293" style="fill: #2ca02c; stroke: #2ca02c" />
4116
- <use ns4:href="#mc655281e0b" x="226.799032" y="399.197519" style="fill: #2ca02c; stroke: #2ca02c" />
4117
- <use ns4:href="#mc655281e0b" x="369.990258" y="383.346345" style="fill: #2ca02c; stroke: #2ca02c" />
4118
- <use ns4:href="#mc655281e0b" x="513.181484" y="381.042612" style="fill: #2ca02c; stroke: #2ca02c" />
4119
- <use ns4:href="#mc655281e0b" x="656.37271" y="332.003214" style="fill: #2ca02c; stroke: #2ca02c" />
4120
- <use ns4:href="#mc655281e0b" x="799.563935" y="335.418073" style="fill: #2ca02c; stroke: #2ca02c" />
4121
  </g>
4122
  </g>
4123
  <g id="series--hf-kernels-flash-attn" class="series">
4124
- <path d="M 83.607806 418.603626 L 226.799032 405.380276 L 369.990258 389.547718 L 513.181484 382.629499 L 656.37271 335.525188 L 799.563935 340.270592 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4125
  <defs>
4126
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4127
  </defs>
4128
  <g clip-path="url(#p09feef2583)">
4129
- <use ns4:href="#m61c8040d7e" x="83.607806" y="418.603626" style="fill: #d62728; stroke: #d62728" />
4130
- <use ns4:href="#m61c8040d7e" x="226.799032" y="405.380276" style="fill: #d62728; stroke: #d62728" />
4131
- <use ns4:href="#m61c8040d7e" x="369.990258" y="389.547718" style="fill: #d62728; stroke: #d62728" />
4132
- <use ns4:href="#m61c8040d7e" x="513.181484" y="382.629499" style="fill: #d62728; stroke: #d62728" />
4133
- <use ns4:href="#m61c8040d7e" x="656.37271" y="335.525188" style="fill: #d62728; stroke: #d62728" />
4134
- <use ns4:href="#m61c8040d7e" x="799.563935" y="340.270592" style="fill: #d62728; stroke: #d62728" />
4135
  </g>
4136
  </g>
4137
  <g id="series--hf-kernels-flash-attn3" class="series">
4138
- <path d="M 83.607806 428.387702 L 226.799032 418.05737 L 369.990258 396.545281 L 513.181484 392.764216 L 656.37271 347.753681 L 799.563935 353.503096 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4139
  <defs>
4140
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4141
  </defs>
4142
  <g clip-path="url(#p09feef2583)">
4143
- <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4144
- <use ns4:href="#m7cd35be9cc" x="226.799032" y="418.05737" style="fill: #9467bd; stroke: #9467bd" />
4145
- <use ns4:href="#m7cd35be9cc" x="369.990258" y="396.545281" style="fill: #9467bd; stroke: #9467bd" />
4146
- <use ns4:href="#m7cd35be9cc" x="513.181484" y="392.764216" style="fill: #9467bd; stroke: #9467bd" />
4147
- <use ns4:href="#m7cd35be9cc" x="656.37271" y="347.753681" style="fill: #9467bd; stroke: #9467bd" />
4148
- <use ns4:href="#m7cd35be9cc" x="799.563935" y="353.503096" style="fill: #9467bd; stroke: #9467bd" />
4149
  </g>
4150
  </g>
4151
  <g id="patch_3">
@@ -4230,7 +4238,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4230
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4231
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4232
  </span> |
4233
- Cell: combine | 4.25s
4234
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4235
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4236
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4337,48 +4345,48 @@ Summary: 6 found, 0 skipped, 0 missing
4337
  COMBINED BENCHMARK SUMMARY
4338
 
4339
  impl wl p50(ms) ok
4340
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.97 True
4341
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
4342
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
4343
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.09 True
4344
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True
4345
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
4346
- hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.94 True
4347
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
4348
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.04 True
4349
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.05 True
4350
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
4351
- hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4352
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4353
- Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
4354
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4355
- Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
4356
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4357
- Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
4358
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4359
- Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
4360
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4361
- Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
4362
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4363
- Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
4364
- torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
4365
- torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
4366
- torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
4367
- torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
4368
- torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
4369
- torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
4370
  torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
4371
- torch_mem_eff cuda_attn_L256_bfloat16 1.89 True
4372
- torch_mem_eff cuda_attn_L320_bfloat16 2.00 True
4373
- torch_mem_eff cuda_attn_L384_bfloat16 1.97 True
4374
- torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
4375
  torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
4376
- xformers_meff cuda_attn_L128_bfloat16 1.00 True
4377
- xformers_meff cuda_attn_L256_bfloat16 1.03 True
4378
- xformers_meff cuda_attn_L320_bfloat16 1.08 True
4379
- xformers_meff cuda_attn_L384_bfloat16 1.09 True
4380
- xformers_meff cuda_attn_L448_bfloat16 1.25 True
4381
- xformers_meff cuda_attn_L512_bfloat16 1.24 True
4382
 
4383
  GENERATING COMBINED VISUALIZATION
4384
 
@@ -4402,7 +4410,7 @@ Implementations included:
4402
  <div class="uv-install-logs" id="uv-logs-combine">
4403
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4404
  <div class="uv-logs-content" style="display: none;">
4405
- Installed 37 packages in 208ms
4406
  </div>
4407
  </div>
4408
  <div class="cell-artifacts">
@@ -4415,7 +4423,7 @@ Installed 37 packages in 208ms
4415
  <rdf:RDF>
4416
  <ns2:Work>
4417
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4418
- <dc:date>2025-10-29T14:28:03.109695</dc:date>
4419
  <dc:format>image/svg+xml</dc:format>
4420
  <dc:creator>
4421
  <ns2:Agent>
@@ -4525,96 +4533,96 @@ Installed 37 packages in 208ms
4525
  <g id="matplotlib.axis_2">
4526
  <g id="ytick_1">
4527
  <g id="grid-y--2" class="grid grid-y">
4528
- <path d="M 47.81 409.00723 L 835.361742 409.00723 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4529
  </g>
4530
  <g id="line2d_7">
4531
  <defs>
4532
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4533
  </defs>
4534
  <g>
4535
- <use ns4:href="#m0fca2865ba" x="47.81" y="409.00723" style="stroke: #000000; stroke-width: 0.8" />
4536
  </g>
4537
  </g>
4538
  <g id="text_7">
4539
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="412.806448" transform="rotate(-0 40.81 412.806448)">1.0</text>
4540
  </g>
4541
  </g>
4542
  <g id="ytick_2">
4543
  <g id="grid-y--3" class="grid grid-y">
4544
- <path d="M 47.81 347.973099 L 835.361742 347.973099 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4545
  </g>
4546
  <g id="line2d_8">
4547
  <g>
4548
- <use ns4:href="#m0fca2865ba" x="47.81" y="347.973099" style="stroke: #000000; stroke-width: 0.8" />
4549
  </g>
4550
  </g>
4551
  <g id="text_8">
4552
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="351.772318" transform="rotate(-0 40.81 351.772318)">1.2</text>
4553
  </g>
4554
  </g>
4555
  <g id="ytick_3">
4556
  <g id="grid-y--4" class="grid grid-y">
4557
- <path d="M 47.81 286.938969 L 835.361742 286.938969 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4558
  </g>
4559
  <g id="line2d_9">
4560
  <g>
4561
- <use ns4:href="#m0fca2865ba" x="47.81" y="286.938969" style="stroke: #000000; stroke-width: 0.8" />
4562
  </g>
4563
  </g>
4564
  <g id="text_9">
4565
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="290.738187" transform="rotate(-0 40.81 290.738187)">1.4</text>
4566
  </g>
4567
  </g>
4568
  <g id="ytick_4">
4569
  <g id="grid-y--5" class="grid grid-y">
4570
- <path d="M 47.81 225.904838 L 835.361742 225.904838 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4571
  </g>
4572
  <g id="line2d_10">
4573
  <g>
4574
- <use ns4:href="#m0fca2865ba" x="47.81" y="225.904838" style="stroke: #000000; stroke-width: 0.8" />
4575
  </g>
4576
  </g>
4577
  <g id="text_10">
4578
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="229.704057" transform="rotate(-0 40.81 229.704057)">1.6</text>
4579
  </g>
4580
  </g>
4581
  <g id="ytick_5">
4582
  <g id="grid-y--6" class="grid grid-y">
4583
- <path d="M 47.81 164.870708 L 835.361742 164.870708 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4584
  </g>
4585
  <g id="line2d_11">
4586
  <g>
4587
- <use ns4:href="#m0fca2865ba" x="47.81" y="164.870708" style="stroke: #000000; stroke-width: 0.8" />
4588
  </g>
4589
  </g>
4590
  <g id="text_11">
4591
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="168.669926" transform="rotate(-0 40.81 168.669926)">1.8</text>
4592
  </g>
4593
  </g>
4594
  <g id="ytick_6">
4595
  <g id="grid-y--7" class="grid grid-y">
4596
- <path d="M 47.81 103.836577 L 835.361742 103.836577 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4597
  </g>
4598
  <g id="line2d_12">
4599
  <g>
4600
- <use ns4:href="#m0fca2865ba" x="47.81" y="103.836577" style="stroke: #000000; stroke-width: 0.8" />
4601
  </g>
4602
  </g>
4603
  <g id="text_12">
4604
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="107.635796" transform="rotate(-0 40.81 107.635796)">2.0</text>
4605
  </g>
4606
  </g>
4607
  <g id="ytick_7">
4608
  <g id="grid-y--8" class="grid grid-y">
4609
- <path d="M 47.81 42.802447 L 835.361742 42.802447 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4610
  </g>
4611
  <g id="line2d_13">
4612
  <g>
4613
- <use ns4:href="#m0fca2865ba" x="47.81" y="42.802447" style="stroke: #000000; stroke-width: 0.8" />
4614
  </g>
4615
  </g>
4616
  <g id="text_13">
4617
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="46.601665" transform="rotate(-0 40.81 46.601665)">2.2</text>
4618
  </g>
4619
  </g>
4620
  <g id="label--y" class="ylabel">
@@ -4622,73 +4630,73 @@ Installed 37 packages in 208ms
4622
  </g>
4623
  </g>
4624
  <g id="series--torch-flash-ma" class="series">
4625
- <path d="M 83.607806 340.639848 L 226.799032 324.181385 L 369.990258 320.559009 L 513.181484 308.901185 L 656.37271 265.282228 L 799.563935 254.967155 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4626
  <defs>
4627
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4628
  </defs>
4629
  <g clip-path="url(#p09feef2583)">
4630
- <use ns4:href="#md7efaf3aec" x="83.607806" y="340.639848" style="fill: #1f77b4; stroke: #1f77b4" />
4631
- <use ns4:href="#md7efaf3aec" x="226.799032" y="324.181385" style="fill: #1f77b4; stroke: #1f77b4" />
4632
- <use ns4:href="#md7efaf3aec" x="369.990258" y="320.559009" style="fill: #1f77b4; stroke: #1f77b4" />
4633
- <use ns4:href="#md7efaf3aec" x="513.181484" y="308.901185" style="fill: #1f77b4; stroke: #1f77b4" />
4634
- <use ns4:href="#md7efaf3aec" x="656.37271" y="265.282228" style="fill: #1f77b4; stroke: #1f77b4" />
4635
- <use ns4:href="#md7efaf3aec" x="799.563935" y="254.967155" style="fill: #1f77b4; stroke: #1f77b4" />
4636
  </g>
4637
  </g>
4638
  <g id="series--torch-mem-eff" class="series">
4639
- <path d="M 83.607806 156.748591 L 226.799032 137.315018 L 369.990258 105.143013 L 513.181484 114.228248 L 656.37271 86.655469 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4640
  <defs>
4641
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4642
  </defs>
4643
  <g clip-path="url(#p09feef2583)">
4644
- <use ns4:href="#m9b8c54d372" x="83.607806" y="156.748591" style="fill: #ff7f0e; stroke: #ff7f0e" />
4645
- <use ns4:href="#m9b8c54d372" x="226.799032" y="137.315018" style="fill: #ff7f0e; stroke: #ff7f0e" />
4646
- <use ns4:href="#m9b8c54d372" x="369.990258" y="105.143013" style="fill: #ff7f0e; stroke: #ff7f0e" />
4647
- <use ns4:href="#m9b8c54d372" x="513.181484" y="114.228248" style="fill: #ff7f0e; stroke: #ff7f0e" />
4648
- <use ns4:href="#m9b8c54d372" x="656.37271" y="86.655469" style="fill: #ff7f0e; stroke: #ff7f0e" />
4649
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4650
  </g>
4651
  </g>
4652
  <g id="series--xformers-meff" class="series">
4653
- <path d="M 83.607806 410.498293 L 226.799032 399.197519 L 369.990258 383.346345 L 513.181484 381.042612 L 656.37271 332.003214 L 799.563935 335.418073 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4654
  <defs>
4655
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4656
  </defs>
4657
  <g clip-path="url(#p09feef2583)">
4658
- <use ns4:href="#mc655281e0b" x="83.607806" y="410.498293" style="fill: #2ca02c; stroke: #2ca02c" />
4659
- <use ns4:href="#mc655281e0b" x="226.799032" y="399.197519" style="fill: #2ca02c; stroke: #2ca02c" />
4660
- <use ns4:href="#mc655281e0b" x="369.990258" y="383.346345" style="fill: #2ca02c; stroke: #2ca02c" />
4661
- <use ns4:href="#mc655281e0b" x="513.181484" y="381.042612" style="fill: #2ca02c; stroke: #2ca02c" />
4662
- <use ns4:href="#mc655281e0b" x="656.37271" y="332.003214" style="fill: #2ca02c; stroke: #2ca02c" />
4663
- <use ns4:href="#mc655281e0b" x="799.563935" y="335.418073" style="fill: #2ca02c; stroke: #2ca02c" />
4664
  </g>
4665
  </g>
4666
  <g id="series--hf-kernels-flash-attn" class="series">
4667
- <path d="M 83.607806 418.603626 L 226.799032 405.380276 L 369.990258 389.547718 L 513.181484 382.629499 L 656.37271 335.525188 L 799.563935 340.270592 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4668
  <defs>
4669
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4670
  </defs>
4671
  <g clip-path="url(#p09feef2583)">
4672
- <use ns4:href="#m61c8040d7e" x="83.607806" y="418.603626" style="fill: #d62728; stroke: #d62728" />
4673
- <use ns4:href="#m61c8040d7e" x="226.799032" y="405.380276" style="fill: #d62728; stroke: #d62728" />
4674
- <use ns4:href="#m61c8040d7e" x="369.990258" y="389.547718" style="fill: #d62728; stroke: #d62728" />
4675
- <use ns4:href="#m61c8040d7e" x="513.181484" y="382.629499" style="fill: #d62728; stroke: #d62728" />
4676
- <use ns4:href="#m61c8040d7e" x="656.37271" y="335.525188" style="fill: #d62728; stroke: #d62728" />
4677
- <use ns4:href="#m61c8040d7e" x="799.563935" y="340.270592" style="fill: #d62728; stroke: #d62728" />
4678
  </g>
4679
  </g>
4680
  <g id="series--hf-kernels-flash-attn3" class="series">
4681
- <path d="M 83.607806 428.387702 L 226.799032 418.05737 L 369.990258 396.545281 L 513.181484 392.764216 L 656.37271 347.753681 L 799.563935 353.503096 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4682
  <defs>
4683
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4684
  </defs>
4685
  <g clip-path="url(#p09feef2583)">
4686
- <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4687
- <use ns4:href="#m7cd35be9cc" x="226.799032" y="418.05737" style="fill: #9467bd; stroke: #9467bd" />
4688
- <use ns4:href="#m7cd35be9cc" x="369.990258" y="396.545281" style="fill: #9467bd; stroke: #9467bd" />
4689
- <use ns4:href="#m7cd35be9cc" x="513.181484" y="392.764216" style="fill: #9467bd; stroke: #9467bd" />
4690
- <use ns4:href="#m7cd35be9cc" x="656.37271" y="347.753681" style="fill: #9467bd; stroke: #9467bd" />
4691
- <use ns4:href="#m7cd35be9cc" x="799.563935" y="353.503096" style="fill: #9467bd; stroke: #9467bd" />
4692
  </g>
4693
  </g>
4694
  <g id="patch_3">
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
 
3880
  <rdf:RDF>
3881
  <ns2:Work>
3882
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3883
+ <dc:date>2025-10-29T15:51:09.340715</dc:date>
3884
  <dc:format>image/svg+xml</dc:format>
3885
  <dc:creator>
3886
  <ns2:Agent>
 
3990
  <g id="matplotlib.axis_2">
3991
  <g id="ytick_1">
3992
  <g id="grid-y--2" class="grid grid-y">
3993
+ <path d="M 47.81 413.024194 L 835.361742 413.024194 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3994
  </g>
3995
  <g id="line2d_7">
3996
  <defs>
3997
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3998
  </defs>
3999
  <g>
4000
+ <use ns4:href="#m0fca2865ba" x="47.81" y="413.024194" style="stroke: #000000; stroke-width: 0.8" />
4001
  </g>
4002
  </g>
4003
  <g id="text_7">
4004
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="416.823413" transform="rotate(-0 40.81 416.823413)">1.0</text>
4005
  </g>
4006
  </g>
4007
  <g id="ytick_2">
4008
  <g id="grid-y--3" class="grid grid-y">
4009
+ <path d="M 47.81 351.27252 L 835.361742 351.27252 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4010
  </g>
4011
  <g id="line2d_8">
4012
  <g>
4013
+ <use ns4:href="#m0fca2865ba" x="47.81" y="351.27252" style="stroke: #000000; stroke-width: 0.8" />
4014
  </g>
4015
  </g>
4016
  <g id="text_8">
4017
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="355.071739" transform="rotate(-0 40.81 355.071739)">1.2</text>
4018
  </g>
4019
  </g>
4020
  <g id="ytick_3">
4021
  <g id="grid-y--4" class="grid grid-y">
4022
+ <path d="M 47.81 289.520846 L 835.361742 289.520846 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4023
  </g>
4024
  <g id="line2d_9">
4025
  <g>
4026
+ <use ns4:href="#m0fca2865ba" x="47.81" y="289.520846" style="stroke: #000000; stroke-width: 0.8" />
4027
  </g>
4028
  </g>
4029
  <g id="text_9">
4030
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="293.320065" transform="rotate(-0 40.81 293.320065)">1.4</text>
4031
  </g>
4032
  </g>
4033
  <g id="ytick_4">
4034
  <g id="grid-y--5" class="grid grid-y">
4035
+ <path d="M 47.81 227.769172 L 835.361742 227.769172 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4036
  </g>
4037
  <g id="line2d_10">
4038
  <g>
4039
+ <use ns4:href="#m0fca2865ba" x="47.81" y="227.769172" style="stroke: #000000; stroke-width: 0.8" />
4040
  </g>
4041
  </g>
4042
  <g id="text_10">
4043
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="231.568391" transform="rotate(-0 40.81 231.568391)">1.6</text>
4044
  </g>
4045
  </g>
4046
  <g id="ytick_5">
4047
  <g id="grid-y--6" class="grid grid-y">
4048
+ <path d="M 47.81 166.017498 L 835.361742 166.017498 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4049
  </g>
4050
  <g id="line2d_11">
4051
  <g>
4052
+ <use ns4:href="#m0fca2865ba" x="47.81" y="166.017498" style="stroke: #000000; stroke-width: 0.8" />
4053
  </g>
4054
  </g>
4055
  <g id="text_11">
4056
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="169.816717" transform="rotate(-0 40.81 169.816717)">1.8</text>
4057
  </g>
4058
  </g>
4059
  <g id="ytick_6">
4060
  <g id="grid-y--7" class="grid grid-y">
4061
+ <path d="M 47.81 104.265824 L 835.361742 104.265824 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4062
  </g>
4063
  <g id="line2d_12">
4064
  <g>
4065
+ <use ns4:href="#m0fca2865ba" x="47.81" y="104.265824" style="stroke: #000000; stroke-width: 0.8" />
4066
  </g>
4067
  </g>
4068
  <g id="text_12">
4069
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="108.065043" transform="rotate(-0 40.81 108.065043)">2.0</text>
4070
  </g>
4071
  </g>
4072
  <g id="ytick_7">
4073
  <g id="grid-y--8" class="grid grid-y">
4074
+ <path d="M 47.81 42.51415 L 835.361742 42.51415 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4075
  </g>
4076
  <g id="line2d_13">
4077
  <g>
4078
+ <use ns4:href="#m0fca2865ba" x="47.81" y="42.51415" style="stroke: #000000; stroke-width: 0.8" />
4079
  </g>
4080
  </g>
4081
  <g id="text_13">
4082
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="46.313369" transform="rotate(-0 40.81 46.313369)">2.2</text>
4083
  </g>
4084
  </g>
4085
  <g id="label--y" class="ylabel">
 
4087
  </g>
4088
  </g>
4089
  <g id="series--torch-flash-ma" class="series">
4090
+ <path d="M 83.607806 346.756003 L 226.799032 329.780159 L 369.990258 321.569965 L 513.181484 313.597515 L 656.37271 266.140736 L 799.563935 260.34812 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4091
  <defs>
4092
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4093
  </defs>
4094
  <g clip-path="url(#p09feef2583)">
4095
+ <use ns4:href="#md7efaf3aec" x="83.607806" y="346.756003" style="fill: #1f77b4; stroke: #1f77b4" />
4096
+ <use ns4:href="#md7efaf3aec" x="226.799032" y="329.780159" style="fill: #1f77b4; stroke: #1f77b4" />
4097
+ <use ns4:href="#md7efaf3aec" x="369.990258" y="321.569965" style="fill: #1f77b4; stroke: #1f77b4" />
4098
+ <use ns4:href="#md7efaf3aec" x="513.181484" y="313.597515" style="fill: #1f77b4; stroke: #1f77b4" />
4099
+ <use ns4:href="#md7efaf3aec" x="656.37271" y="266.140736" style="fill: #1f77b4; stroke: #1f77b4" />
4100
+ <use ns4:href="#md7efaf3aec" x="799.563935" y="260.34812" style="fill: #1f77b4; stroke: #1f77b4" />
4101
  </g>
4102
  </g>
4103
  <g id="series--torch-mem-eff" class="series">
4104
+ <path d="M 83.607806 155.401459 L 226.799032 122.036412 L 369.990258 119.6 L 513.181484 89.078617 L 656.37271 83.422164 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4105
  <defs>
4106
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4107
  </defs>
4108
  <g clip-path="url(#p09feef2583)">
4109
+ <use ns4:href="#m9b8c54d372" x="83.607806" y="155.401459" style="fill: #ff7f0e; stroke: #ff7f0e" />
4110
+ <use ns4:href="#m9b8c54d372" x="226.799032" y="122.036412" style="fill: #ff7f0e; stroke: #ff7f0e" />
4111
+ <use ns4:href="#m9b8c54d372" x="369.990258" y="119.6" style="fill: #ff7f0e; stroke: #ff7f0e" />
4112
+ <use ns4:href="#m9b8c54d372" x="513.181484" y="89.078617" style="fill: #ff7f0e; stroke: #ff7f0e" />
4113
+ <use ns4:href="#m9b8c54d372" x="656.37271" y="83.422164" style="fill: #ff7f0e; stroke: #ff7f0e" />
4114
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4115
  </g>
4116
  </g>
4117
  <g id="series--xformers-meff" class="series">
4118
+ <path d="M 83.607806 415.619926 L 226.799032 397.353472 L 369.990258 394.772252 L 513.181484 393.111132 L 656.37271 341.729417 L 799.563935 342.902698 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4119
  <defs>
4120
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4121
  </defs>
4122
  <g clip-path="url(#p09feef2583)">
4123
+ <use ns4:href="#mc655281e0b" x="83.607806" y="415.619926" style="fill: #2ca02c; stroke: #2ca02c" />
4124
+ <use ns4:href="#mc655281e0b" x="226.799032" y="397.353472" style="fill: #2ca02c; stroke: #2ca02c" />
4125
+ <use ns4:href="#mc655281e0b" x="369.990258" y="394.772252" style="fill: #2ca02c; stroke: #2ca02c" />
4126
+ <use ns4:href="#mc655281e0b" x="513.181484" y="393.111132" style="fill: #2ca02c; stroke: #2ca02c" />
4127
+ <use ns4:href="#mc655281e0b" x="656.37271" y="341.729417" style="fill: #2ca02c; stroke: #2ca02c" />
4128
+ <use ns4:href="#mc655281e0b" x="799.563935" y="342.902698" style="fill: #2ca02c; stroke: #2ca02c" />
4129
  </g>
4130
  </g>
4131
  <g id="series--hf-kernels-flash-attn" class="series">
4132
+ <path d="M 83.607806 428.387702 L 226.799032 413.415083 L 369.990258 398.063616 L 513.181484 390.915551 L 656.37271 347.629789 L 799.563935 352.847806 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4133
  <defs>
4134
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4135
  </defs>
4136
  <g clip-path="url(#p09feef2583)">
4137
+ <use ns4:href="#m61c8040d7e" x="83.607806" y="428.387702" style="fill: #d62728; stroke: #d62728" />
4138
+ <use ns4:href="#m61c8040d7e" x="226.799032" y="413.415083" style="fill: #d62728; stroke: #d62728" />
4139
+ <use ns4:href="#m61c8040d7e" x="369.990258" y="398.063616" style="fill: #d62728; stroke: #d62728" />
4140
+ <use ns4:href="#m61c8040d7e" x="513.181484" y="390.915551" style="fill: #d62728; stroke: #d62728" />
4141
+ <use ns4:href="#m61c8040d7e" x="656.37271" y="347.629789" style="fill: #d62728; stroke: #d62728" />
4142
+ <use ns4:href="#m61c8040d7e" x="799.563935" y="352.847806" style="fill: #d62728; stroke: #d62728" />
4143
  </g>
4144
  </g>
4145
  <g id="series--hf-kernels-flash-attn3" class="series">
4146
+ <path d="M 83.607806 411.846899 L 226.799032 414.604111 L 369.990258 404.183516 L 513.181484 406.09473 L 656.37271 355.213203 L 799.563935 367.844508 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4147
  <defs>
4148
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4149
  </defs>
4150
  <g clip-path="url(#p09feef2583)">
4151
+ <use ns4:href="#m7cd35be9cc" x="83.607806" y="411.846899" style="fill: #9467bd; stroke: #9467bd" />
4152
+ <use ns4:href="#m7cd35be9cc" x="226.799032" y="414.604111" style="fill: #9467bd; stroke: #9467bd" />
4153
+ <use ns4:href="#m7cd35be9cc" x="369.990258" y="404.183516" style="fill: #9467bd; stroke: #9467bd" />
4154
+ <use ns4:href="#m7cd35be9cc" x="513.181484" y="406.09473" style="fill: #9467bd; stroke: #9467bd" />
4155
+ <use ns4:href="#m7cd35be9cc" x="656.37271" y="355.213203" style="fill: #9467bd; stroke: #9467bd" />
4156
+ <use ns4:href="#m7cd35be9cc" x="799.563935" y="367.844508" style="fill: #9467bd; stroke: #9467bd" />
4157
  </g>
4158
  </g>
4159
  <g id="patch_3">
 
4238
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4239
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4240
  </span> |
4241
+ Cell: combine | 4.24s
4242
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4243
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4244
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4345
  COMBINED BENCHMARK SUMMARY
4346
 
4347
  impl wl p50(ms) ok
4348
+ hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
4349
+ hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
4350
+ hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
4351
+ hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.07 True
4352
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
4353
+ hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.19 True
4354
+ hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 1.00 True
4355
+ hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.99 True
4356
+ hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True
4357
+ hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
4358
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.19 True
4359
+ hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.15 True
4360
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4361
+ Error: module &#x27;sage_attention_d4f4a6803f593c0b&#x27; has no attribute &#x27;fwd&#x27;
4362
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4363
+ Error: module &#x27;sage_attention_d4f4a6803f593c0b&#x27; has no attribute &#x27;fwd&#x27;
4364
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4365
+ Error: module &#x27;sage_attention_d4f4a6803f593c0b&#x27; has no attribute &#x27;fwd&#x27;
4366
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4367
+ Error: module &#x27;sage_attention_d4f4a6803f593c0b&#x27; has no attribute &#x27;fwd&#x27;
4368
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4369
+ Error: module &#x27;sage_attention_d4f4a6803f593c0b&#x27; has no attribute &#x27;fwd&#x27;
4370
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4371
+ Error: module &#x27;sage_attention_d4f4a6803f593c0b&#x27; has no attribute &#x27;fwd&#x27;
4372
+ torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
4373
+ torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
4374
+ torch_flash_ma cuda_attn_L320_bfloat16 1.30 True
4375
+ torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
4376
+ torch_flash_ma cuda_attn_L448_bfloat16 1.48 True
4377
+ torch_flash_ma cuda_attn_L512_bfloat16 1.49 True
4378
  torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
4379
+ torch_mem_eff cuda_attn_L256_bfloat16 1.94 True
4380
+ torch_mem_eff cuda_attn_L320_bfloat16 1.95 True
4381
+ torch_mem_eff cuda_attn_L384_bfloat16 2.05 True
4382
+ torch_mem_eff cuda_attn_L448_bfloat16 2.07 True
4383
  torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
4384
+ xformers_meff cuda_attn_L128_bfloat16 0.99 True
4385
+ xformers_meff cuda_attn_L256_bfloat16 1.05 True
4386
+ xformers_meff cuda_attn_L320_bfloat16 1.06 True
4387
+ xformers_meff cuda_attn_L384_bfloat16 1.06 True
4388
+ xformers_meff cuda_attn_L448_bfloat16 1.23 True
4389
+ xformers_meff cuda_attn_L512_bfloat16 1.23 True
4390
 
4391
  GENERATING COMBINED VISUALIZATION
4392
 
 
4410
  <div class="uv-install-logs" id="uv-logs-combine">
4411
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4412
  <div class="uv-logs-content" style="display: none;">
4413
+ Installed 37 packages in 204ms
4414
  </div>
4415
  </div>
4416
  <div class="cell-artifacts">
 
4423
  <rdf:RDF>
4424
  <ns2:Work>
4425
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4426
+ <dc:date>2025-10-29T15:51:09.340715</dc:date>
4427
  <dc:format>image/svg+xml</dc:format>
4428
  <dc:creator>
4429
  <ns2:Agent>
 
4533
  <g id="matplotlib.axis_2">
4534
  <g id="ytick_1">
4535
  <g id="grid-y--2" class="grid grid-y">
4536
+ <path d="M 47.81 413.024194 L 835.361742 413.024194 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4537
  </g>
4538
  <g id="line2d_7">
4539
  <defs>
4540
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4541
  </defs>
4542
  <g>
4543
+ <use ns4:href="#m0fca2865ba" x="47.81" y="413.024194" style="stroke: #000000; stroke-width: 0.8" />
4544
  </g>
4545
  </g>
4546
  <g id="text_7">
4547
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="416.823413" transform="rotate(-0 40.81 416.823413)">1.0</text>
4548
  </g>
4549
  </g>
4550
  <g id="ytick_2">
4551
  <g id="grid-y--3" class="grid grid-y">
4552
+ <path d="M 47.81 351.27252 L 835.361742 351.27252 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4553
  </g>
4554
  <g id="line2d_8">
4555
  <g>
4556
+ <use ns4:href="#m0fca2865ba" x="47.81" y="351.27252" style="stroke: #000000; stroke-width: 0.8" />
4557
  </g>
4558
  </g>
4559
  <g id="text_8">
4560
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="355.071739" transform="rotate(-0 40.81 355.071739)">1.2</text>
4561
  </g>
4562
  </g>
4563
  <g id="ytick_3">
4564
  <g id="grid-y--4" class="grid grid-y">
4565
+ <path d="M 47.81 289.520846 L 835.361742 289.520846 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4566
  </g>
4567
  <g id="line2d_9">
4568
  <g>
4569
+ <use ns4:href="#m0fca2865ba" x="47.81" y="289.520846" style="stroke: #000000; stroke-width: 0.8" />
4570
  </g>
4571
  </g>
4572
  <g id="text_9">
4573
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="293.320065" transform="rotate(-0 40.81 293.320065)">1.4</text>
4574
  </g>
4575
  </g>
4576
  <g id="ytick_4">
4577
  <g id="grid-y--5" class="grid grid-y">
4578
+ <path d="M 47.81 227.769172 L 835.361742 227.769172 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4579
  </g>
4580
  <g id="line2d_10">
4581
  <g>
4582
+ <use ns4:href="#m0fca2865ba" x="47.81" y="227.769172" style="stroke: #000000; stroke-width: 0.8" />
4583
  </g>
4584
  </g>
4585
  <g id="text_10">
4586
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="231.568391" transform="rotate(-0 40.81 231.568391)">1.6</text>
4587
  </g>
4588
  </g>
4589
  <g id="ytick_5">
4590
  <g id="grid-y--6" class="grid grid-y">
4591
+ <path d="M 47.81 166.017498 L 835.361742 166.017498 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4592
  </g>
4593
  <g id="line2d_11">
4594
  <g>
4595
+ <use ns4:href="#m0fca2865ba" x="47.81" y="166.017498" style="stroke: #000000; stroke-width: 0.8" />
4596
  </g>
4597
  </g>
4598
  <g id="text_11">
4599
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="169.816717" transform="rotate(-0 40.81 169.816717)">1.8</text>
4600
  </g>
4601
  </g>
4602
  <g id="ytick_6">
4603
  <g id="grid-y--7" class="grid grid-y">
4604
+ <path d="M 47.81 104.265824 L 835.361742 104.265824 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4605
  </g>
4606
  <g id="line2d_12">
4607
  <g>
4608
+ <use ns4:href="#m0fca2865ba" x="47.81" y="104.265824" style="stroke: #000000; stroke-width: 0.8" />
4609
  </g>
4610
  </g>
4611
  <g id="text_12">
4612
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="108.065043" transform="rotate(-0 40.81 108.065043)">2.0</text>
4613
  </g>
4614
  </g>
4615
  <g id="ytick_7">
4616
  <g id="grid-y--8" class="grid grid-y">
4617
+ <path d="M 47.81 42.51415 L 835.361742 42.51415 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4618
  </g>
4619
  <g id="line2d_13">
4620
  <g>
4621
+ <use ns4:href="#m0fca2865ba" x="47.81" y="42.51415" style="stroke: #000000; stroke-width: 0.8" />
4622
  </g>
4623
  </g>
4624
  <g id="text_13">
4625
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="46.313369" transform="rotate(-0 40.81 46.313369)">2.2</text>
4626
  </g>
4627
  </g>
4628
  <g id="label--y" class="ylabel">
 
4630
  </g>
4631
  </g>
4632
  <g id="series--torch-flash-ma" class="series">
4633
+ <path d="M 83.607806 346.756003 L 226.799032 329.780159 L 369.990258 321.569965 L 513.181484 313.597515 L 656.37271 266.140736 L 799.563935 260.34812 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4634
  <defs>
4635
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4636
  </defs>
4637
  <g clip-path="url(#p09feef2583)">
4638
+ <use ns4:href="#md7efaf3aec" x="83.607806" y="346.756003" style="fill: #1f77b4; stroke: #1f77b4" />
4639
+ <use ns4:href="#md7efaf3aec" x="226.799032" y="329.780159" style="fill: #1f77b4; stroke: #1f77b4" />
4640
+ <use ns4:href="#md7efaf3aec" x="369.990258" y="321.569965" style="fill: #1f77b4; stroke: #1f77b4" />
4641
+ <use ns4:href="#md7efaf3aec" x="513.181484" y="313.597515" style="fill: #1f77b4; stroke: #1f77b4" />
4642
+ <use ns4:href="#md7efaf3aec" x="656.37271" y="266.140736" style="fill: #1f77b4; stroke: #1f77b4" />
4643
+ <use ns4:href="#md7efaf3aec" x="799.563935" y="260.34812" style="fill: #1f77b4; stroke: #1f77b4" />
4644
  </g>
4645
  </g>
4646
  <g id="series--torch-mem-eff" class="series">
4647
+ <path d="M 83.607806 155.401459 L 226.799032 122.036412 L 369.990258 119.6 L 513.181484 89.078617 L 656.37271 83.422164 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4648
  <defs>
4649
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4650
  </defs>
4651
  <g clip-path="url(#p09feef2583)">
4652
+ <use ns4:href="#m9b8c54d372" x="83.607806" y="155.401459" style="fill: #ff7f0e; stroke: #ff7f0e" />
4653
+ <use ns4:href="#m9b8c54d372" x="226.799032" y="122.036412" style="fill: #ff7f0e; stroke: #ff7f0e" />
4654
+ <use ns4:href="#m9b8c54d372" x="369.990258" y="119.6" style="fill: #ff7f0e; stroke: #ff7f0e" />
4655
+ <use ns4:href="#m9b8c54d372" x="513.181484" y="89.078617" style="fill: #ff7f0e; stroke: #ff7f0e" />
4656
+ <use ns4:href="#m9b8c54d372" x="656.37271" y="83.422164" style="fill: #ff7f0e; stroke: #ff7f0e" />
4657
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4658
  </g>
4659
  </g>
4660
  <g id="series--xformers-meff" class="series">
4661
+ <path d="M 83.607806 415.619926 L 226.799032 397.353472 L 369.990258 394.772252 L 513.181484 393.111132 L 656.37271 341.729417 L 799.563935 342.902698 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4662
  <defs>
4663
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4664
  </defs>
4665
  <g clip-path="url(#p09feef2583)">
4666
+ <use ns4:href="#mc655281e0b" x="83.607806" y="415.619926" style="fill: #2ca02c; stroke: #2ca02c" />
4667
+ <use ns4:href="#mc655281e0b" x="226.799032" y="397.353472" style="fill: #2ca02c; stroke: #2ca02c" />
4668
+ <use ns4:href="#mc655281e0b" x="369.990258" y="394.772252" style="fill: #2ca02c; stroke: #2ca02c" />
4669
+ <use ns4:href="#mc655281e0b" x="513.181484" y="393.111132" style="fill: #2ca02c; stroke: #2ca02c" />
4670
+ <use ns4:href="#mc655281e0b" x="656.37271" y="341.729417" style="fill: #2ca02c; stroke: #2ca02c" />
4671
+ <use ns4:href="#mc655281e0b" x="799.563935" y="342.902698" style="fill: #2ca02c; stroke: #2ca02c" />
4672
  </g>
4673
  </g>
4674
  <g id="series--hf-kernels-flash-attn" class="series">
4675
+ <path d="M 83.607806 428.387702 L 226.799032 413.415083 L 369.990258 398.063616 L 513.181484 390.915551 L 656.37271 347.629789 L 799.563935 352.847806 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4676
  <defs>
4677
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4678
  </defs>
4679
  <g clip-path="url(#p09feef2583)">
4680
+ <use ns4:href="#m61c8040d7e" x="83.607806" y="428.387702" style="fill: #d62728; stroke: #d62728" />
4681
+ <use ns4:href="#m61c8040d7e" x="226.799032" y="413.415083" style="fill: #d62728; stroke: #d62728" />
4682
+ <use ns4:href="#m61c8040d7e" x="369.990258" y="398.063616" style="fill: #d62728; stroke: #d62728" />
4683
+ <use ns4:href="#m61c8040d7e" x="513.181484" y="390.915551" style="fill: #d62728; stroke: #d62728" />
4684
+ <use ns4:href="#m61c8040d7e" x="656.37271" y="347.629789" style="fill: #d62728; stroke: #d62728" />
4685
+ <use ns4:href="#m61c8040d7e" x="799.563935" y="352.847806" style="fill: #d62728; stroke: #d62728" />
4686
  </g>
4687
  </g>
4688
  <g id="series--hf-kernels-flash-attn3" class="series">
4689
+ <path d="M 83.607806 411.846899 L 226.799032 414.604111 L 369.990258 404.183516 L 513.181484 406.09473 L 656.37271 355.213203 L 799.563935 367.844508 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4690
  <defs>
4691
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4692
  </defs>
4693
  <g clip-path="url(#p09feef2583)">
4694
+ <use ns4:href="#m7cd35be9cc" x="83.607806" y="411.846899" style="fill: #9467bd; stroke: #9467bd" />
4695
+ <use ns4:href="#m7cd35be9cc" x="226.799032" y="414.604111" style="fill: #9467bd; stroke: #9467bd" />
4696
+ <use ns4:href="#m7cd35be9cc" x="369.990258" y="404.183516" style="fill: #9467bd; stroke: #9467bd" />
4697
+ <use ns4:href="#m7cd35be9cc" x="513.181484" y="406.09473" style="fill: #9467bd; stroke: #9467bd" />
4698
+ <use ns4:href="#m7cd35be9cc" x="656.37271" y="355.213203" style="fill: #9467bd; stroke: #9467bd" />
4699
+ <use ns4:href="#m7cd35be9cc" x="799.563935" y="367.844508" style="fill: #9467bd; stroke: #9467bd" />
4700
  </g>
4701
  </g>
4702
  <g id="patch_3">
index.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
@@ -3865,8 +3873,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3865
  <h1>All Benchmarks Aggregated Report</h1>
3866
  <h2><a href="layer_norm/">Layer Norm</a></h2>
3867
  <div class="artifact-preview">
3868
- <object data="layer_norm/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
3869
- </object>
3870
  </div>
3871
 
3872
  <table>
@@ -3889,8 +3896,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3889
  </table>
3890
  <h2><a href="rotary/">Rotary Position Embeddings</a></h2>
3891
  <div class="artifact-preview">
3892
- <object data="rotary/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
3893
- </object>
3894
  </div>
3895
 
3896
  <table>
@@ -3913,8 +3919,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3913
  </table>
3914
  <h2><a href="flash_attn/">Flash Attention</a></h2>
3915
  <div class="artifact-preview">
3916
- <object data="flash_attn/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
3917
- </object>
3918
  </div>
3919
 
3920
  <table>
@@ -3953,8 +3958,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3953
  </table>
3954
  <h2><a href="causal_conv1d/">Causal Conv1D</a></h2>
3955
  <div class="artifact-preview">
3956
- <object data="causal_conv1d/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
3957
- </object>
3958
  </div>
3959
 
3960
  <table>
@@ -3977,8 +3981,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3977
  </table>
3978
  <h2><a href="activation/">Activation</a></h2>
3979
  <div class="artifact-preview">
3980
- <object data="activation/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
3981
- </object>
3982
  </div>
3983
 
3984
  <table>
@@ -4001,8 +4004,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4001
  </table>
4002
  <h2><a href="relu/">ReLU</a></h2>
4003
  <div class="artifact-preview">
4004
- <object data="relu/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
4005
- </object>
4006
  </div>
4007
 
4008
  <table>
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
 
3873
  <h1>All Benchmarks Aggregated Report</h1>
3874
  <h2><a href="layer_norm/">Layer Norm</a></h2>
3875
  <div class="artifact-preview">
3876
+ <img src="layer_norm/results/artifacts/combine/latency.svg" alt="Layer Norm Latency" width="800">
 
3877
  </div>
3878
 
3879
  <table>
 
3896
  </table>
3897
  <h2><a href="rotary/">Rotary Position Embeddings</a></h2>
3898
  <div class="artifact-preview">
3899
+ <img src="rotary/results/artifacts/combine/latency.svg" alt="Rotary Position Embeddings Latency" width="800">
 
3900
  </div>
3901
 
3902
  <table>
 
3919
  </table>
3920
  <h2><a href="flash_attn/">Flash Attention</a></h2>
3921
  <div class="artifact-preview">
3922
+ <img src="flash_attn/results/artifacts/combine/latency.svg" alt="Flash Attention Latency" width="800">
 
3923
  </div>
3924
 
3925
  <table>
 
3958
  </table>
3959
  <h2><a href="causal_conv1d/">Causal Conv1D</a></h2>
3960
  <div class="artifact-preview">
3961
+ <img src="causal_conv1d/results/artifacts/combine/latency.svg" alt="Causal Conv1D Latency" width="800">
 
3962
  </div>
3963
 
3964
  <table>
 
3981
  </table>
3982
  <h2><a href="activation/">Activation</a></h2>
3983
  <div class="artifact-preview">
3984
+ <img src="activation/results/artifacts/combine/latency.svg" alt="Activation Latency" width="800">
 
3985
  </div>
3986
 
3987
  <table>
 
4004
  </table>
4005
  <h2><a href="relu/">ReLU</a></h2>
4006
  <div class="artifact-preview">
4007
+ <img src="relu/results/artifacts/combine/latency.svg" alt="ReLU Latency" width="800">
 
4008
  </div>
4009
 
4010
  <table>
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl CHANGED
@@ -1,4 +1,4 @@
1
- {"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8322699999894212, "p50": 0.8364899999833142, "p90": 0.8382409999967422, "mean": 0.8359703999872181, "iqr": 0.0036810000096920703, "raw_times": [0.8322699999894212, 0.8382909999795629, 0.8345599999870501, 0.8382409999967422, 0.8364899999833142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8381600000006983, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
2
- {"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6435499999829517, "p50": 1.6499199999771008, "p90": 1.6516499999852385, "mean": 1.650240399987979, "iqr": 0.0024989999474200886, "raw_times": [1.6516499999852385, 1.6435499999829517, 1.6499199999771008, 1.6491510000378184, 1.656930999956785], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.644769999984419, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
3
- {"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6425610000396773, "p50": 1.6517310000381258, "p90": 1.654420999955164, "mean": 1.6505027999983213, "iqr": 0.006990999963818467, "raw_times": [1.6474299999913455, 1.6517310000381258, 1.654420999955164, 1.6563709999672938, 1.6425610000396773], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6471609999939574, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
4
- {"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.237169999977141, "p50": 3.2576509999557857, "p90": 3.264301000001524, "mean": 3.257706599993071, "iqr": 0.008230999981151399, "raw_times": [3.2576509999557857, 3.264301000001524, 3.2733410000105323, 3.2560700000203724, 3.237169999977141], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2725309999932506, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
 
1
+ {"ts": "2025-10-29T15:50:43Z", "run": "146df378aaa14293b5d1526656fc6cb4", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8137589999819284, "p50": 0.8219090000238793, "p90": 0.8223789999988185, "mean": 0.8196492000138278, "iqr": 0.007259999961206631, "raw_times": [0.825080000026901, 0.8219090000238793, 0.8137589999819284, 0.8223789999988185, 0.8151190000376118], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8213489999775447, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
2
+ {"ts": "2025-10-29T15:50:43Z", "run": "146df378aaa14293b5d1526656fc6cb4", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.680888999999297, "p50": 1.6820789999769659, "p90": 1.6842590000010205, "mean": 1.683131400000093, "iqr": 0.0026189999857706425, "raw_times": [1.6820789999769659, 1.680888999999297, 1.6816400000152498, 1.6842590000010205, 1.6867900000079317], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.687689999982922, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
3
+ {"ts": "2025-10-29T15:50:43Z", "run": "146df378aaa14293b5d1526656fc6cb4", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.603787000021839, "p50": 1.6093779999550861, "p90": 1.6102179999961663, "mean": 1.6086159999986194, "iqr": 0.002069999993636884, "raw_times": [1.6093779999550861, 1.6081480000025294, 1.603787000021839, 1.611549000017476, 1.6102179999961663], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6238279999924998, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
4
+ {"ts": "2025-10-29T15:50:44Z", "run": "146df378aaa14293b5d1526656fc6cb4", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.30805800001599, "p50": 3.3301390000133324, "p90": 3.331328999991001, "mean": 3.3278527999868857, "iqr": 0.001610000026630587, "raw_times": [3.331328999991001, 3.3400189999497343, 3.3297189999643706, 3.30805800001599, 3.3301390000133324], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.3235790000389898, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
layer_norm/impls/cells/benchmark.py CHANGED
@@ -3,7 +3,6 @@
3
  # dependencies = [
4
  # "numpy",
5
  # "torch==2.8.0",
6
- # "kernels",
7
  # "kernels-benchmark-tools",
8
  # ]
9
  #
@@ -13,37 +12,15 @@
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
- from kernels import get_kernel
17
 
18
- # Load the layer norm kernel
19
- layer_norm_kernel = get_kernel("kernels-community/layer-norm")
20
 
21
-
22
- def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
23
- B, S, D = x.shape
24
- # The kernel expects [N, D] input; support beta (bias) if provided.
25
- out = layer_norm_kernel.dropout_add_ln_fwd(
26
- input=x.view(-1, D),
27
- gamma=weight,
28
- beta=bias,
29
- rowscale=None,
30
- colscale=None,
31
- x0_subset=None,
32
- z_subset=None,
33
- dropout_p=0.0,
34
- epsilon=eps,
35
- rowscale_const=1.0,
36
- z_numrows=S,
37
- gen=None,
38
- residual_in_fp32=False,
39
- is_rms_norm=False,
40
- )[0].view(B, S, D)
41
- return out
42
 
43
 
44
  run_benchmark(
45
  kernel_type=KernelTypeEnum.LAYER_NORM,
46
- impl_name="hf_kernels_layer_norm",
47
- impl_tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"},
48
- impl_func=hf_kernels_layer_norm,
49
  )
 
3
  # dependencies = [
4
  # "numpy",
5
  # "torch==2.8.0",
 
6
  # "kernels-benchmark-tools",
7
  # ]
8
  #
 
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 
15
 
 
 
16
 
17
+ def torch_layer_norm(x, weight, bias, eps: float = 1e-5):
18
+ return torch.nn.functional.layer_norm(x, (x.shape[-1],), weight, bias, eps)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
 
21
  run_benchmark(
22
  kernel_type=KernelTypeEnum.LAYER_NORM,
23
+ impl_name="torch_layer_norm",
24
+ impl_tags={"family": "torch", "op": "layer_norm"},
25
+ impl_func=torch_layer_norm,
26
  )
layer_norm/impls/hf_kernels_layer_norm.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
@@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3872
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3873
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3874
  </span> |
3875
- Cell: benchmark | 6.34s
3876
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3877
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3878
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3943,19 +3951,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3945
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3946
- hf_kernels_layer_norm 5.26% 209.855us 46.73% 1.864ms 1.864ms 0.000us 0.00% 3.097ms 3.097ms 1
3947
- _layer_norm_f8ec252::dropout_add_ln_fwd 1.78% 70.832us 40.86% 1.630ms 543.337us 2.360ms 100.00% 3.097ms 1.032ms 3
3948
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.362ms 100.06% 2.362ms 2.362ms 1
3949
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.360ms 100.00% 2.360ms 786.699us 3
3950
- Activity Buffer Request 36.61% 1.461ms 36.61% 1.461ms 1.461ms 736.736us 31.22% 736.736us 736.736us 1
3951
- aten::view 0.61% 24.271us 0.61% 24.271us 4.045us 0.000us 0.00% 0.000us 0.000us 6
3952
- aten::empty 1.19% 47.642us 1.19% 47.642us 5.294us 0.000us 0.00% 0.000us 0.000us 9
3953
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.27% 10.789us 0.27% 10.789us 3.596us 0.000us 0.00% 0.000us 0.000us 3
3954
- cudaLaunchKernel 1.01% 40.102us 1.01% 40.102us 13.367us 0.000us 0.00% 0.000us 0.000us 3
3955
- cudaDeviceSynchronize 53.27% 2.125ms 53.27% 2.125ms 2.125ms 0.000us 0.00% 0.000us 0.000us 1
3956
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3957
- Self CPU time total: 3.989ms
3958
- Self CUDA time total: 2.360ms
3959
 
3960
 
3961
 
@@ -3965,19 +3973,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
3965
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3966
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3967
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3968
- hf_kernels_layer_norm 2.24% 143.733us 27.27% 1.751ms 1.751ms 0.000us 0.00% 6.440ms 6.440ms 1
3969
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.75% 48.181us 24.84% 1.595ms 531.669us 4.846ms 100.00% 6.440ms 2.147ms 3
3970
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.848ms 100.03% 4.848ms 4.848ms 1
3971
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.846ms 100.00% 4.846ms 1.615ms 3
3972
- Activity Buffer Request 23.08% 1.482ms 23.08% 1.482ms 1.482ms 1.594ms 32.88% 1.594ms 1.594ms 1
3973
- aten::view 0.20% 12.572us 0.20% 12.572us 2.095us 0.000us 0.00% 0.000us 0.000us 6
3974
- aten::empty 0.46% 29.840us 0.46% 29.840us 3.316us 0.000us 0.00% 0.000us 0.000us 9
3975
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.420us 0.08% 5.420us 1.807us 0.000us 0.00% 0.000us 0.000us 3
3976
- cudaLaunchKernel 0.46% 29.490us 0.46% 29.490us 9.830us 0.000us 0.00% 0.000us 0.000us 3
3977
- cudaDeviceSynchronize 72.73% 4.670ms 72.73% 4.670ms 4.670ms 0.000us 0.00% 0.000us 0.000us 1
3978
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3979
- Self CPU time total: 6.421ms
3980
- Self CUDA time total: 4.846ms
3981
 
3982
 
3983
 
@@ -3987,19 +3995,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
- hf_kernels_layer_norm 1.96% 126.465us 27.43% 1.766ms 1.766ms 0.000us 0.00% 6.435ms 6.435ms 1
3991
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.73% 46.779us 25.26% 1.627ms 542.360us 4.838ms 100.00% 6.435ms 2.145ms 3
3992
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.839ms 100.03% 4.839ms 4.839ms 1
3993
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.838ms 100.00% 4.838ms 1.613ms 3
3994
- Activity Buffer Request 23.54% 1.516ms 23.54% 1.516ms 1.516ms 1.597ms 33.01% 1.597ms 1.597ms 1
3995
- aten::view 0.20% 12.929us 0.20% 12.929us 2.155us 0.000us 0.00% 0.000us 0.000us 6
3996
- aten::empty 0.46% 29.911us 0.46% 29.911us 3.323us 0.000us 0.00% 0.000us 0.000us 9
3997
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.300us 0.08% 5.300us 1.767us 0.000us 0.00% 0.000us 0.000us 3
3998
- cudaLaunchKernel 0.45% 29.003us 0.45% 29.003us 9.668us 0.000us 0.00% 0.000us 0.000us 3
3999
- cudaDeviceSynchronize 72.57% 4.674ms 72.57% 4.674ms 4.674ms 0.000us 0.00% 0.000us 0.000us 1
4000
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4001
- Self CPU time total: 6.440ms
4002
- Self CUDA time total: 4.838ms
4003
 
4004
 
4005
 
@@ -4009,24 +4017,24 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
4009
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4010
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4011
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4012
- hf_kernels_layer_norm 1.17% 134.085us 17.09% 1.957ms 1.957ms 0.000us 0.00% 12.886ms 12.886ms 1
4013
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.41% 46.869us 15.80% 1.809ms 603.015us 9.665ms 100.00% 12.886ms 4.295ms 3
4014
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.667ms 100.01% 9.667ms 9.667ms 1
4015
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.665ms 100.00% 9.665ms 3.222ms 3
4016
- Activity Buffer Request 12.76% 1.462ms 12.76% 1.462ms 1.462ms 3.220ms 33.32% 3.220ms 3.220ms 1
4017
- aten::view 0.12% 13.968us 0.12% 13.968us 2.328us 0.000us 0.00% 0.000us 0.000us 6
4018
- aten::empty 0.26% 30.043us 0.26% 30.043us 3.338us 0.000us 0.00% 0.000us 0.000us 9
4019
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.590us 0.05% 5.590us 1.863us 0.000us 0.00% 0.000us 0.000us 3
4020
- cudaLaunchKernel 2.31% 264.797us 2.31% 264.797us 88.266us 0.000us 0.00% 0.000us 0.000us 3
4021
- cudaDeviceSynchronize 82.91% 9.495ms 82.91% 9.495ms 9.495ms 0.000us 0.00% 0.000us 0.000us 1
4022
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4023
- Self CPU time total: 11.452ms
4024
- Self CUDA time total: 9.665ms
4025
 
4026
 
4027
  impl wl p50(ms) ok
4028
- hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
4029
- hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4030
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4031
  hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
4032
  </pre></div>
@@ -4035,12 +4043,12 @@ hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
4035
  <div class="uv-logs-content" style="display: none;">
4036
  Downloading hf-xet (3.2MiB)
4037
  Downloading hf-xet
4038
- Installed 15 packages in 13ms
4039
  </div>
4040
  </div>
4041
  <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4042
- Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.22it/s]
4043
- Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.44it/s]</div>
4044
  <div class="cell-artifacts">
4045
  <h4>Artifacts:</h4>
4046
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
 
3880
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3881
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3882
  </span> |
3883
+ Cell: benchmark | 9.83s
3884
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3885
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3886
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3951
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3952
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3953
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3954
+ hf_kernels_layer_norm 4.95% 198.743us 46.81% 1.878ms 1.878ms 0.000us 0.00% 3.111ms 3.111ms 1
3955
+ _layer_norm_f8ec252::dropout_add_ln_fwd 1.73% 69.535us 41.21% 1.653ms 550.933us 2.375ms 100.00% 3.111ms 1.037ms 3
3956
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.376ms 100.07% 2.376ms 2.376ms 1
3957
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.375ms 100.00% 2.375ms 791.590us 3
3958
+ Activity Buffer Request 36.98% 1.483ms 36.98% 1.483ms 1.483ms 736.636us 31.02% 736.636us 736.636us 1
3959
+ aten::view 0.65% 26.132us 0.65% 26.132us 4.355us 0.000us 0.00% 0.000us 0.000us 6
3960
+ aten::empty 1.22% 49.009us 1.22% 49.009us 5.445us 0.000us 0.00% 0.000us 0.000us 9
3961
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.22% 8.769us 0.22% 8.769us 2.923us 0.000us 0.00% 0.000us 0.000us 3
3962
+ cudaLaunchKernel 1.05% 42.291us 1.05% 42.291us 14.097us 0.000us 0.00% 0.000us 0.000us 3
3963
+ cudaDeviceSynchronize 53.19% 2.133ms 53.19% 2.133ms 2.133ms 0.000us 0.00% 0.000us 0.000us 1
3964
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3965
+ Self CPU time total: 4.011ms
3966
+ Self CUDA time total: 2.375ms
3967
 
3968
 
3969
 
 
3973
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3974
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3975
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3976
+ hf_kernels_layer_norm 1.97% 125.105us 26.88% 1.705ms 1.705ms 0.000us 0.00% 6.375ms 6.375ms 1
3977
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.73% 46.170us 24.73% 1.568ms 522.755us 4.809ms 100.00% 6.375ms 2.125ms 3
3978
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.811ms 100.03% 4.811ms 4.811ms 1
3979
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.809ms 100.00% 4.809ms 1.603ms 3
3980
+ Activity Buffer Request 22.98% 1.457ms 22.98% 1.457ms 1.457ms 1.565ms 32.55% 1.565ms 1.565ms 1
3981
+ aten::view 0.18% 11.529us 0.18% 11.529us 1.922us 0.000us 0.00% 0.000us 0.000us 6
3982
+ aten::empty 0.46% 29.430us 0.46% 29.430us 3.270us 0.000us 0.00% 0.000us 0.000us 9
3983
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.900us 0.08% 4.900us 1.633us 0.000us 0.00% 0.000us 0.000us 3
3984
+ cudaLaunchKernel 0.48% 30.441us 0.48% 30.441us 10.147us 0.000us 0.00% 0.000us 0.000us 3
3985
+ cudaDeviceSynchronize 73.12% 4.638ms 73.12% 4.638ms 4.638ms 0.000us 0.00% 0.000us 0.000us 1
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
+ Self CPU time total: 6.343ms
3988
+ Self CUDA time total: 4.809ms
3989
 
3990
 
3991
 
 
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3997
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3998
+ hf_kernels_layer_norm 1.75% 110.793us 26.94% 1.702ms 1.702ms 0.000us 0.00% 6.331ms 6.331ms 1
3999
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.70% 44.248us 25.01% 1.580ms 526.532us 4.779ms 100.00% 6.331ms 2.110ms 3
4000
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.781ms 100.03% 4.781ms 4.781ms 1
4001
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.779ms 100.00% 4.779ms 1.593ms 3
4002
+ Activity Buffer Request 23.30% 1.472ms 23.30% 1.472ms 1.472ms 1.552ms 32.48% 1.552ms 1.552ms 1
4003
+ aten::view 0.18% 11.190us 0.18% 11.190us 1.865us 0.000us 0.00% 0.000us 0.000us 6
4004
+ aten::empty 0.49% 30.823us 0.49% 30.823us 3.425us 0.000us 0.00% 0.000us 0.000us 9
4005
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.981us 0.08% 4.981us 1.660us 0.000us 0.00% 0.000us 0.000us 3
4006
+ cudaLaunchKernel 0.44% 28.031us 0.44% 28.031us 9.344us 0.000us 0.00% 0.000us 0.000us 3
4007
+ cudaDeviceSynchronize 73.06% 4.615ms 73.06% 4.615ms 4.615ms 0.000us 0.00% 0.000us 0.000us 1
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
+ Self CPU time total: 6.317ms
4010
+ Self CUDA time total: 4.779ms
4011
 
4012
 
4013
 
 
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
+ hf_kernels_layer_norm 1.11% 111.882us 6.14% 619.354us 619.354us 0.000us 0.00% 12.808ms 12.808ms 1
4021
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.46% 46.119us 4.92% 496.462us 165.487us 9.625ms 100.00% 12.808ms 4.269ms 3
4022
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.626ms 100.01% 9.626ms 9.626ms 1
4023
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.625ms 100.00% 9.625ms 3.208ms 3
4024
+ Activity Buffer Request 1.38% 138.943us 1.38% 138.943us 138.943us 3.183ms 33.07% 3.183ms 3.183ms 1
4025
+ aten::view 0.11% 11.010us 0.11% 11.010us 1.835us 0.000us 0.00% 0.000us 0.000us 6
4026
+ aten::empty 0.31% 31.174us 0.31% 31.174us 3.464us 0.000us 0.00% 0.000us 0.000us 9
4027
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.190us 0.05% 5.190us 1.730us 0.000us 0.00% 0.000us 0.000us 3
4028
+ cudaLaunchKernel 2.73% 275.036us 2.73% 275.036us 91.679us 0.000us 0.00% 0.000us 0.000us 3
4029
+ cudaDeviceSynchronize 93.86% 9.465ms 93.86% 9.465ms 9.465ms 0.000us 0.00% 0.000us 0.000us 1
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
+ Self CPU time total: 10.085ms
4032
+ Self CUDA time total: 9.625ms
4033
 
4034
 
4035
  impl wl p50(ms) ok
4036
+ hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
4037
+ hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
4038
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4039
  hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
4040
  </pre></div>
 
4043
  <div class="uv-logs-content" style="display: none;">
4044
  Downloading hf-xet (3.2MiB)
4045
  Downloading hf-xet
4046
+ Installed 52 packages in 191ms
4047
  </div>
4048
  </div>
4049
  <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4050
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.30it/s]
4051
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.60it/s]</div>
4052
  <div class="cell-artifacts">
4053
  <h4>Artifacts:</h4>
4054
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
layer_norm/impls/torch_layer_norm.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
@@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: nv | 0.26s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3895,7 @@ Cell: nv | 0.26s
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
- <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:26:26 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3904,7 @@ Cell: nv | 0.26s
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
- | N/A 30C P0 108W / 350W | 0MiB / 46068MiB | 100% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
@@ -3918,9 +3926,9 @@ Cell: nv | 0.26s
3918
  <span class="collapse-indicators">
3919
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
- Cell: benchmark | 7.36s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3968,19 +3976,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
3968
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3969
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3970
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3971
- torch_layer_norm 3.90% 151.572us 46.01% 1.786ms 1.786ms 0.000us 0.00% 3.026ms 3.026ms 1
3972
- aten::layer_norm 0.43% 16.762us 42.11% 1.635ms 544.851us 0.000us 0.00% 3.026ms 1.009ms 3
3973
- aten::native_layer_norm 2.06% 80.009us 41.67% 1.618ms 539.263us 2.316ms 100.00% 3.026ms 1.009ms 3
3974
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.318ms 100.06% 2.318ms 2.318ms 1
3975
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.316ms 100.00% 2.316ms 772.127us 3
3976
- Activity Buffer Request 37.08% 1.440ms 37.08% 1.440ms 1.440ms 709.855us 30.65% 709.855us 709.855us 1
3977
- aten::empty 1.19% 46.261us 1.19% 46.261us 5.140us 0.000us 0.00% 0.000us 0.000us 9
3978
- cudaLaunchKernel 1.16% 45.163us 1.16% 45.163us 15.054us 0.000us 0.00% 0.000us 0.000us 3
3979
- aten::view 0.17% 6.761us 0.17% 6.761us 1.127us 0.000us 0.00% 0.000us 0.000us 6
3980
- cudaDeviceSynchronize 53.99% 2.096ms 53.99% 2.096ms 2.096ms 0.000us 0.00% 0.000us 0.000us 1
3981
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3982
- Self CPU time total: 3.882ms
3983
- Self CUDA time total: 2.316ms
3984
 
3985
 
3986
 
@@ -3990,19 +3998,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
3990
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3991
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3992
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3993
- torch_layer_norm 1.19% 75.581us 25.55% 1.628ms 1.628ms 0.000us 0.00% 6.473ms 6.473ms 1
3994
- aten::layer_norm 0.14% 9.142us 24.37% 1.553ms 517.550us 0.000us 0.00% 6.473ms 2.158ms 3
3995
- aten::native_layer_norm 0.81% 51.921us 24.22% 1.544ms 514.502us 4.881ms 100.00% 6.473ms 2.158ms 3
3996
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.882ms 100.03% 4.882ms 4.882ms 1
3997
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.881ms 100.00% 4.881ms 1.627ms 3
3998
- Activity Buffer Request 22.46% 1.431ms 22.46% 1.431ms 1.431ms 1.592ms 32.61% 1.592ms 1.592ms 1
3999
- aten::empty 0.44% 27.841us 0.44% 27.841us 3.093us 0.000us 0.00% 0.000us 0.000us 9
4000
- cudaLaunchKernel 0.45% 28.910us 0.45% 28.910us 9.637us 0.000us 0.00% 0.000us 0.000us 3
4001
- aten::view 0.06% 3.829us 0.06% 3.829us 0.638us 0.000us 0.00% 0.000us 0.000us 6
4002
- cudaDeviceSynchronize 74.45% 4.743ms 74.45% 4.743ms 4.743ms 0.000us 0.00% 0.000us 0.000us 1
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
- Self CPU time total: 6.372ms
4005
- Self CUDA time total: 4.881ms
4006
 
4007
 
4008
 
@@ -4012,19 +4020,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4014
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4015
- torch_layer_norm 1.15% 71.882us 26.71% 1.668ms 1.668ms 0.000us 0.00% 6.222ms 6.222ms 1
4016
- aten::layer_norm 0.15% 9.629us 25.56% 1.596ms 532.153us 0.000us 0.00% 6.222ms 2.074ms 3
4017
- aten::native_layer_norm 0.90% 56.373us 25.41% 1.587ms 528.943us 4.717ms 100.00% 6.222ms 2.074ms 3
4018
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.718ms 100.03% 4.718ms 4.718ms 1
4019
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.717ms 100.00% 4.717ms 1.572ms 3
4020
- Activity Buffer Request 23.44% 1.464ms 23.44% 1.464ms 1.464ms 1.506ms 31.93% 1.506ms 1.506ms 1
4021
- aten::empty 0.46% 28.850us 0.46% 28.850us 3.206us 0.000us 0.00% 0.000us 0.000us 9
4022
- cudaLaunchKernel 0.52% 32.781us 0.52% 32.781us 10.927us 0.000us 0.00% 0.000us 0.000us 3
4023
- aten::view 0.07% 4.590us 0.07% 4.590us 0.765us 0.000us 0.00% 0.000us 0.000us 6
4024
- cudaDeviceSynchronize 73.29% 4.577ms 73.29% 4.577ms 4.577ms 0.000us 0.00% 0.000us 0.000us 1
4025
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4026
- Self CPU time total: 6.246ms
4027
- Self CUDA time total: 4.717ms
4028
 
4029
 
4030
 
@@ -4034,19 +4042,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
4034
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4035
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
- torch_layer_norm 0.67% 74.340us 13.35% 1.490ms 1.490ms 0.000us 0.00% 13.028ms 13.028ms 1
4038
- aten::layer_norm 0.09% 9.510us 12.69% 1.416ms 471.835us 0.000us 0.00% 13.028ms 4.343ms 3
4039
- aten::native_layer_norm 0.47% 52.269us 12.60% 1.406ms 468.665us 9.808ms 100.00% 13.028ms 4.343ms 3
4040
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.809ms 100.02% 9.809ms 9.809ms 1
4041
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.808ms 100.00% 9.808ms 3.269ms 3
4042
- Activity Buffer Request 9.72% 1.085ms 9.72% 1.085ms 1.085ms 3.220ms 32.83% 3.220ms 3.220ms 1
4043
- aten::empty 0.26% 29.181us 0.26% 29.181us 3.242us 0.000us 0.00% 0.000us 0.000us 9
4044
- cudaLaunchKernel 2.11% 235.817us 2.11% 235.817us 78.606us 0.000us 0.00% 0.000us 0.000us 3
4045
- aten::view 0.04% 4.022us 0.04% 4.022us 0.670us 0.000us 0.00% 0.000us 0.000us 6
4046
- cudaDeviceSynchronize 86.65% 9.669ms 86.65% 9.669ms 9.669ms 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
- Self CPU time total: 11.159ms
4049
- Self CUDA time total: 9.808ms
4050
 
4051
 
4052
  impl wl p50(ms) ok
@@ -4055,12 +4063,6 @@ torch_layer_norm LN_B16_S2048_D8192 1.68 True
4055
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
4056
  torch_layer_norm LN_B16_S4096_D8192 3.33 True
4057
  </pre></div>
4058
- <div class="uv-install-logs" id="uv-logs-benchmark">
4059
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4060
- <div class="uv-logs-content" style="display: none;">
4061
- Installed 37 packages in 222ms
4062
- </div>
4063
- </div>
4064
  <div class="cell-artifacts">
4065
  <h4>Artifacts:</h4>
4066
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
 
3879
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3880
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3881
  </span> |
3882
+ Cell: nv | 0.23s
3883
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3884
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3885
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3895
  </div>
3896
  </div>
3897
  <div id="output-nv" class="cell-output">
3898
+ <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 15:50:44 2025
3899
  +-----------------------------------------------------------------------------------------+
3900
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3901
  |-----------------------------------------+------------------------+----------------------+
 
3904
  | | | MIG M. |
3905
  |=========================================+========================+======================|
3906
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3907
+ | N/A 29C P0 138W / 350W | 0MiB / 46068MiB | 49% Default |
3908
  | | | N/A |
3909
  +-----------------------------------------+------------------------+----------------------+
3910
 
 
3926
  <span class="collapse-indicators">
3927
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3928
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3929
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3930
  </span> |
3931
+ Cell: benchmark | 3.85s
3932
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3933
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3934
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3976
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3978
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3979
+ torch_layer_norm 3.59% 140.394us 45.88% 1.793ms 1.793ms 0.000us 0.00% 3.034ms 3.034ms 1
3980
+ aten::layer_norm 0.43% 16.891us 42.29% 1.653ms 551.033us 0.000us 0.00% 3.034ms 1.011ms 3
3981
+ aten::native_layer_norm 2.49% 97.515us 41.85% 1.636ms 545.403us 2.324ms 100.00% 3.034ms 1.011ms 3
3982
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.325ms 100.07% 2.325ms 2.325ms 1
3983
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.324ms 100.00% 2.324ms 774.631us 3
3984
+ Activity Buffer Request 36.92% 1.443ms 36.92% 1.443ms 1.443ms 709.916us 30.55% 709.916us 709.916us 1
3985
+ aten::empty 1.11% 43.309us 1.11% 43.309us 4.812us 0.000us 0.00% 0.000us 0.000us 9
3986
+ cudaLaunchKernel 1.17% 45.620us 1.17% 45.620us 15.207us 0.000us 0.00% 0.000us 0.000us 3
3987
+ aten::view 0.17% 6.600us 0.17% 6.600us 1.100us 0.000us 0.00% 0.000us 0.000us 6
3988
+ cudaDeviceSynchronize 54.12% 2.116ms 54.12% 2.116ms 2.116ms 0.000us 0.00% 0.000us 0.000us 1
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
+ Self CPU time total: 3.909ms
3991
+ Self CUDA time total: 2.324ms
3992
 
3993
 
3994
 
 
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4000
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4001
+ torch_layer_norm 1.51% 96.533us 25.68% 1.646ms 1.646ms 0.000us 0.00% 6.506ms 6.506ms 1
4002
+ aten::layer_norm 0.14% 9.019us 24.18% 1.550ms 516.535us 0.000us 0.00% 6.506ms 2.169ms 3
4003
+ aten::native_layer_norm 0.81% 51.783us 24.04% 1.541ms 513.529us 4.903ms 100.00% 6.506ms 2.169ms 3
4004
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.905ms 100.03% 4.905ms 4.905ms 1
4005
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.903ms 100.00% 4.903ms 1.634ms 3
4006
+ Activity Buffer Request 22.28% 1.428ms 22.28% 1.428ms 1.428ms 1.602ms 32.68% 1.602ms 1.602ms 1
4007
+ aten::empty 0.45% 29.001us 0.45% 29.001us 3.222us 0.000us 0.00% 0.000us 0.000us 9
4008
+ cudaLaunchKernel 0.43% 27.850us 0.43% 27.850us 9.283us 0.000us 0.00% 0.000us 0.000us 3
4009
+ aten::view 0.07% 4.220us 0.07% 4.220us 0.703us 0.000us 0.00% 0.000us 0.000us 6
4010
+ cudaDeviceSynchronize 74.32% 4.763ms 74.32% 4.763ms 4.763ms 0.000us 0.00% 0.000us 0.000us 1
4011
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4012
+ Self CPU time total: 6.409ms
4013
+ Self CUDA time total: 4.903ms
4014
 
4015
 
4016
 
 
4020
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4021
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4022
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4023
+ torch_layer_norm 1.49% 93.320us 26.51% 1.656ms 1.656ms 0.000us 0.00% 6.235ms 6.235ms 1
4024
+ aten::layer_norm 0.15% 9.262us 25.02% 1.563ms 520.876us 0.000us 0.00% 6.235ms 2.078ms 3
4025
+ aten::native_layer_norm 0.82% 51.181us 24.87% 1.553ms 517.789us 4.722ms 100.00% 6.235ms 2.078ms 3
4026
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.723ms 100.03% 4.723ms 4.723ms 1
4027
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.722ms 100.00% 4.722ms 1.574ms 3
4028
+ Activity Buffer Request 23.09% 1.443ms 23.09% 1.443ms 1.443ms 1.513ms 32.04% 1.513ms 1.513ms 1
4029
+ aten::empty 0.46% 28.530us 0.46% 28.530us 3.170us 0.000us 0.00% 0.000us 0.000us 9
4030
+ cudaLaunchKernel 0.44% 27.431us 0.44% 27.431us 9.144us 0.000us 0.00% 0.000us 0.000us 3
4031
+ aten::view 0.06% 3.670us 0.06% 3.670us 0.612us 0.000us 0.00% 0.000us 0.000us 6
4032
+ cudaDeviceSynchronize 73.49% 4.591ms 73.49% 4.591ms 4.591ms 0.000us 0.00% 0.000us 0.000us 1
4033
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4034
+ Self CPU time total: 6.247ms
4035
+ Self CUDA time total: 4.722ms
4036
 
4037
 
4038
 
 
4042
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4043
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4044
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4045
+ torch_layer_norm 0.75% 86.532us 16.17% 1.873ms 1.873ms 0.000us 0.00% 13.086ms 13.086ms 1
4046
+ aten::layer_norm 0.08% 9.721us 15.43% 1.787ms 595.631us 0.000us 0.00% 13.086ms 4.362ms 3
4047
+ aten::native_layer_norm 0.46% 53.132us 15.34% 1.777ms 592.390us 9.848ms 100.00% 13.086ms 4.362ms 3
4048
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.850ms 100.01% 9.850ms 9.850ms 1
4049
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.848ms 100.00% 9.848ms 3.283ms 3
4050
+ Activity Buffer Request 12.61% 1.460ms 12.61% 1.460ms 1.460ms 3.238ms 32.88% 3.238ms 3.238ms 1
4051
+ aten::empty 0.27% 30.840us 0.27% 30.840us 3.427us 0.000us 0.00% 0.000us 0.000us 9
4052
+ cudaLaunchKernel 1.98% 229.105us 1.98% 229.105us 76.368us 0.000us 0.00% 0.000us 0.000us 3
4053
+ aten::view 0.03% 3.969us 0.03% 3.969us 0.661us 0.000us 0.00% 0.000us 0.000us 6
4054
+ cudaDeviceSynchronize 83.83% 9.710ms 83.83% 9.710ms 9.710ms 0.000us 0.00% 0.000us 0.000us 1
4055
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4056
+ Self CPU time total: 11.583ms
4057
+ Self CUDA time total: 9.848ms
4058
 
4059
 
4060
  impl wl p50(ms) ok
 
4063
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
4064
  torch_layer_norm LN_B16_S4096_D8192 3.33 True
4065
  </pre></div>
 
 
 
 
 
 
4066
  <div class="cell-artifacts">
4067
  <h4>Artifacts:</h4>
4068
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
layer_norm/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 8fd53794c4617f7e947676c655de6f739b720b8f16a59432369c127bfc08190a
  • Pointer size: 130 Bytes
  • Size of remote file: 14.6 kB

Git LFS Details

  • SHA256: 9bbb6ba8f80ad7d025abae8130bb65dedc3691b259d1e31011653d588f2a3243
  • Pointer size: 130 Bytes
  • Size of remote file: 14.6 kB
layer_norm/results/combined_results.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
@@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
- <dc:date>2025-10-29T14:27:45.722521</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
@@ -3956,70 +3964,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3956
  <g id="matplotlib.axis_2">
3957
  <g id="ytick_1">
3958
  <g id="grid-y--2" class="grid grid-y">
3959
- <path d="M 47.72 408.957392 L 840.20233 408.957392 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3960
  </g>
3961
  <g id="line2d_5">
3962
  <defs>
3963
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3964
  </defs>
3965
  <g>
3966
- <use ns4:href="#m0fca2865ba" x="47.72" y="408.957392" style="stroke: #000000; stroke-width: 0.8" />
3967
  </g>
3968
  </g>
3969
  <g id="text_5">
3970
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.756611" transform="rotate(-0 40.72 412.756611)">1.0</text>
3971
  </g>
3972
  </g>
3973
  <g id="ytick_2">
3974
  <g id="grid-y--3" class="grid grid-y">
3975
- <path d="M 47.72 331.05018 L 840.20233 331.05018 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3976
  </g>
3977
  <g id="line2d_6">
3978
  <g>
3979
- <use ns4:href="#m0fca2865ba" x="47.72" y="331.05018" style="stroke: #000000; stroke-width: 0.8" />
3980
  </g>
3981
  </g>
3982
  <g id="text_6">
3983
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.849399" transform="rotate(-0 40.72 334.849399)">1.5</text>
3984
  </g>
3985
  </g>
3986
  <g id="ytick_3">
3987
  <g id="grid-y--4" class="grid grid-y">
3988
- <path d="M 47.72 253.142969 L 840.20233 253.142969 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3989
  </g>
3990
  <g id="line2d_7">
3991
  <g>
3992
- <use ns4:href="#m0fca2865ba" x="47.72" y="253.142969" style="stroke: #000000; stroke-width: 0.8" />
3993
  </g>
3994
  </g>
3995
  <g id="text_7">
3996
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.942188" transform="rotate(-0 40.72 256.942188)">2.0</text>
3997
  </g>
3998
  </g>
3999
  <g id="ytick_4">
4000
  <g id="grid-y--5" class="grid grid-y">
4001
- <path d="M 47.72 175.235758 L 840.20233 175.235758 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4002
  </g>
4003
  <g id="line2d_8">
4004
  <g>
4005
- <use ns4:href="#m0fca2865ba" x="47.72" y="175.235758" style="stroke: #000000; stroke-width: 0.8" />
4006
  </g>
4007
  </g>
4008
  <g id="text_8">
4009
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.034976" transform="rotate(-0 40.72 179.034976)">2.5</text>
4010
  </g>
4011
  </g>
4012
  <g id="ytick_5">
4013
  <g id="grid-y--6" class="grid grid-y">
4014
- <path d="M 47.72 97.328546 L 840.20233 97.328546 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4015
  </g>
4016
  <g id="line2d_9">
4017
  <g>
4018
- <use ns4:href="#m0fca2865ba" x="47.72" y="97.328546" style="stroke: #000000; stroke-width: 0.8" />
4019
  </g>
4020
  </g>
4021
  <g id="text_9">
4022
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.127765" transform="rotate(-0 40.72 101.127765)">3.0</text>
4023
  </g>
4024
  </g>
4025
  <g id="label--y" class="ylabel">
@@ -4027,27 +4035,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4027
  </g>
4028
  </g>
4029
  <g id="series--torch-layer-norm" class="series">
4030
- <path d="M 83.741924 437.689571 L 323.888085 302.833591 L 564.034245 313.993176 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4031
  <defs>
4032
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4033
  </defs>
4034
  <g clip-path="url(#p2214f54723)">
4035
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4036
- <use ns4:href="#md7efaf3aec" x="323.888085" y="302.833591" style="fill: #1f77b4; stroke: #1f77b4" />
4037
- <use ns4:href="#md7efaf3aec" x="564.034245" y="313.993176" style="fill: #1f77b4; stroke: #1f77b4" />
4038
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4039
  </g>
4040
  </g>
4041
  <g id="series--hf-kernels-layer-norm" class="series">
4042
- <path d="M 83.741924 434.434608 L 323.888085 307.690482 L 564.034245 307.408302 L 804.180406 57.182805 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4043
  <defs>
4044
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4045
  </defs>
4046
  <g clip-path="url(#p2214f54723)">
4047
- <use ns4:href="#m9b8c54d372" x="83.741924" y="434.434608" style="fill: #ff7f0e; stroke: #ff7f0e" />
4048
- <use ns4:href="#m9b8c54d372" x="323.888085" y="307.690482" style="fill: #ff7f0e; stroke: #ff7f0e" />
4049
- <use ns4:href="#m9b8c54d372" x="564.034245" y="307.408302" style="fill: #ff7f0e; stroke: #ff7f0e" />
4050
- <use ns4:href="#m9b8c54d372" x="804.180406" y="57.182805" style="fill: #ff7f0e; stroke: #ff7f0e" />
4051
  </g>
4052
  </g>
4053
  <g id="patch_3">
@@ -4105,7 +4113,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4105
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4106
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4107
  </span> |
4108
- Cell: combine | 4.21s
4109
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4110
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4111
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4192,8 +4200,8 @@ Summary: 2 found, 0 skipped, 0 missing
4192
  COMBINED BENCHMARK SUMMARY
4193
 
4194
  impl wl p50(ms) ok
4195
- hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
4196
- hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4197
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4198
  hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
4199
  torch_layer_norm LN_B16_S2048_D4096 0.82 True
@@ -4219,7 +4227,7 @@ Implementations included:
4219
  <div class="uv-install-logs" id="uv-logs-combine">
4220
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4221
  <div class="uv-logs-content" style="display: none;">
4222
- Installed 37 packages in 210ms
4223
  </div>
4224
  </div>
4225
  <div class="cell-artifacts">
@@ -4232,7 +4240,7 @@ Installed 37 packages in 210ms
4232
  <rdf:RDF>
4233
  <ns2:Work>
4234
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4235
- <dc:date>2025-10-29T14:27:45.722521</dc:date>
4236
  <dc:format>image/svg+xml</dc:format>
4237
  <dc:creator>
4238
  <ns2:Agent>
@@ -4316,70 +4324,70 @@ Installed 37 packages in 210ms
4316
  <g id="matplotlib.axis_2">
4317
  <g id="ytick_1">
4318
  <g id="grid-y--2" class="grid grid-y">
4319
- <path d="M 47.72 408.957392 L 840.20233 408.957392 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4320
  </g>
4321
  <g id="line2d_5">
4322
  <defs>
4323
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4324
  </defs>
4325
  <g>
4326
- <use ns4:href="#m0fca2865ba" x="47.72" y="408.957392" style="stroke: #000000; stroke-width: 0.8" />
4327
  </g>
4328
  </g>
4329
  <g id="text_5">
4330
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.756611" transform="rotate(-0 40.72 412.756611)">1.0</text>
4331
  </g>
4332
  </g>
4333
  <g id="ytick_2">
4334
  <g id="grid-y--3" class="grid grid-y">
4335
- <path d="M 47.72 331.05018 L 840.20233 331.05018 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4336
  </g>
4337
  <g id="line2d_6">
4338
  <g>
4339
- <use ns4:href="#m0fca2865ba" x="47.72" y="331.05018" style="stroke: #000000; stroke-width: 0.8" />
4340
  </g>
4341
  </g>
4342
  <g id="text_6">
4343
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.849399" transform="rotate(-0 40.72 334.849399)">1.5</text>
4344
  </g>
4345
  </g>
4346
  <g id="ytick_3">
4347
  <g id="grid-y--4" class="grid grid-y">
4348
- <path d="M 47.72 253.142969 L 840.20233 253.142969 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4349
  </g>
4350
  <g id="line2d_7">
4351
  <g>
4352
- <use ns4:href="#m0fca2865ba" x="47.72" y="253.142969" style="stroke: #000000; stroke-width: 0.8" />
4353
  </g>
4354
  </g>
4355
  <g id="text_7">
4356
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.942188" transform="rotate(-0 40.72 256.942188)">2.0</text>
4357
  </g>
4358
  </g>
4359
  <g id="ytick_4">
4360
  <g id="grid-y--5" class="grid grid-y">
4361
- <path d="M 47.72 175.235758 L 840.20233 175.235758 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4362
  </g>
4363
  <g id="line2d_8">
4364
  <g>
4365
- <use ns4:href="#m0fca2865ba" x="47.72" y="175.235758" style="stroke: #000000; stroke-width: 0.8" />
4366
  </g>
4367
  </g>
4368
  <g id="text_8">
4369
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.034976" transform="rotate(-0 40.72 179.034976)">2.5</text>
4370
  </g>
4371
  </g>
4372
  <g id="ytick_5">
4373
  <g id="grid-y--6" class="grid grid-y">
4374
- <path d="M 47.72 97.328546 L 840.20233 97.328546 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4375
  </g>
4376
  <g id="line2d_9">
4377
  <g>
4378
- <use ns4:href="#m0fca2865ba" x="47.72" y="97.328546" style="stroke: #000000; stroke-width: 0.8" />
4379
  </g>
4380
  </g>
4381
  <g id="text_9">
4382
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.127765" transform="rotate(-0 40.72 101.127765)">3.0</text>
4383
  </g>
4384
  </g>
4385
  <g id="label--y" class="ylabel">
@@ -4387,27 +4395,27 @@ Installed 37 packages in 210ms
4387
  </g>
4388
  </g>
4389
  <g id="series--torch-layer-norm" class="series">
4390
- <path d="M 83.741924 437.689571 L 323.888085 302.833591 L 564.034245 313.993176 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4391
  <defs>
4392
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4393
  </defs>
4394
  <g clip-path="url(#p2214f54723)">
4395
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4396
- <use ns4:href="#md7efaf3aec" x="323.888085" y="302.833591" style="fill: #1f77b4; stroke: #1f77b4" />
4397
- <use ns4:href="#md7efaf3aec" x="564.034245" y="313.993176" style="fill: #1f77b4; stroke: #1f77b4" />
4398
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4399
  </g>
4400
  </g>
4401
  <g id="series--hf-kernels-layer-norm" class="series">
4402
- <path d="M 83.741924 434.434608 L 323.888085 307.690482 L 564.034245 307.408302 L 804.180406 57.182805 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4403
  <defs>
4404
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4405
  </defs>
4406
  <g clip-path="url(#p2214f54723)">
4407
- <use ns4:href="#m9b8c54d372" x="83.741924" y="434.434608" style="fill: #ff7f0e; stroke: #ff7f0e" />
4408
- <use ns4:href="#m9b8c54d372" x="323.888085" y="307.690482" style="fill: #ff7f0e; stroke: #ff7f0e" />
4409
- <use ns4:href="#m9b8c54d372" x="564.034245" y="307.408302" style="fill: #ff7f0e; stroke: #ff7f0e" />
4410
- <use ns4:href="#m9b8c54d372" x="804.180406" y="57.182805" style="fill: #ff7f0e; stroke: #ff7f0e" />
4411
  </g>
4412
  </g>
4413
  <g id="patch_3">
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
 
3880
  <rdf:RDF>
3881
  <ns2:Work>
3882
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3883
+ <dc:date>2025-10-29T15:51:05.081730</dc:date>
3884
  <dc:format>image/svg+xml</dc:format>
3885
  <dc:creator>
3886
  <ns2:Agent>
 
3964
  <g id="matplotlib.axis_2">
3965
  <g id="ytick_1">
3966
  <g id="grid-y--2" class="grid grid-y">
3967
+ <path d="M 47.72 409.909979 L 840.20233 409.909979 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3968
  </g>
3969
  <g id="line2d_5">
3970
  <defs>
3971
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3972
  </defs>
3973
  <g>
3974
+ <use ns4:href="#m0fca2865ba" x="47.72" y="409.909979" style="stroke: #000000; stroke-width: 0.8" />
3975
  </g>
3976
  </g>
3977
  <g id="text_5">
3978
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="413.709198" transform="rotate(-0 40.72 413.709198)">1.0</text>
3979
  </g>
3980
  </g>
3981
  <g id="ytick_2">
3982
  <g id="grid-y--3" class="grid grid-y">
3983
+ <path d="M 47.72 331.917289 L 840.20233 331.917289 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3984
  </g>
3985
  <g id="line2d_6">
3986
  <g>
3987
+ <use ns4:href="#m0fca2865ba" x="47.72" y="331.917289" style="stroke: #000000; stroke-width: 0.8" />
3988
  </g>
3989
  </g>
3990
  <g id="text_6">
3991
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="335.716508" transform="rotate(-0 40.72 335.716508)">1.5</text>
3992
  </g>
3993
  </g>
3994
  <g id="ytick_3">
3995
  <g id="grid-y--4" class="grid grid-y">
3996
+ <path d="M 47.72 253.924599 L 840.20233 253.924599 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3997
  </g>
3998
  <g id="line2d_7">
3999
  <g>
4000
+ <use ns4:href="#m0fca2865ba" x="47.72" y="253.924599" style="stroke: #000000; stroke-width: 0.8" />
4001
  </g>
4002
  </g>
4003
  <g id="text_7">
4004
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.723817" transform="rotate(-0 40.72 257.723817)">2.0</text>
4005
  </g>
4006
  </g>
4007
  <g id="ytick_4">
4008
  <g id="grid-y--5" class="grid grid-y">
4009
+ <path d="M 47.72 175.931908 L 840.20233 175.931908 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4010
  </g>
4011
  <g id="line2d_8">
4012
  <g>
4013
+ <use ns4:href="#m0fca2865ba" x="47.72" y="175.931908" style="stroke: #000000; stroke-width: 0.8" />
4014
  </g>
4015
  </g>
4016
  <g id="text_8">
4017
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.731127" transform="rotate(-0 40.72 179.731127)">2.5</text>
4018
  </g>
4019
  </g>
4020
  <g id="ytick_5">
4021
  <g id="grid-y--6" class="grid grid-y">
4022
+ <path d="M 47.72 97.939218 L 840.20233 97.939218 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4023
  </g>
4024
  <g id="line2d_9">
4025
  <g>
4026
+ <use ns4:href="#m0fca2865ba" x="47.72" y="97.939218" style="stroke: #000000; stroke-width: 0.8" />
4027
  </g>
4028
  </g>
4029
  <g id="text_9">
4030
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.738437" transform="rotate(-0 40.72 101.738437)">3.0</text>
4031
  </g>
4032
  </g>
4033
  <g id="label--y" class="ylabel">
 
4035
  </g>
4036
  </g>
4037
  <g id="series--torch-layer-norm" class="series">
4038
+ <path d="M 83.741924 437.689571 L 323.888085 303.515627 L 564.034245 314.85592 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4039
  <defs>
4040
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4041
  </defs>
4042
  <g clip-path="url(#p2214f54723)">
4043
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4044
+ <use ns4:href="#md7efaf3aec" x="323.888085" y="303.515627" style="fill: #1f77b4; stroke: #1f77b4" />
4045
+ <use ns4:href="#md7efaf3aec" x="564.034245" y="314.85592" style="fill: #1f77b4; stroke: #1f77b4" />
4046
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4047
  </g>
4048
  </g>
4049
  <g id="series--hf-kernels-layer-norm" class="series">
4050
+ <path d="M 83.741924 435.933176 L 323.888085 307.404498 L 564.034245 307.981644 L 804.180406 57.739446 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4051
  <defs>
4052
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4053
  </defs>
4054
  <g clip-path="url(#p2214f54723)">
4055
+ <use ns4:href="#m9b8c54d372" x="83.741924" y="435.933176" style="fill: #ff7f0e; stroke: #ff7f0e" />
4056
+ <use ns4:href="#m9b8c54d372" x="323.888085" y="307.404498" style="fill: #ff7f0e; stroke: #ff7f0e" />
4057
+ <use ns4:href="#m9b8c54d372" x="564.034245" y="307.981644" style="fill: #ff7f0e; stroke: #ff7f0e" />
4058
+ <use ns4:href="#m9b8c54d372" x="804.180406" y="57.739446" style="fill: #ff7f0e; stroke: #ff7f0e" />
4059
  </g>
4060
  </g>
4061
  <g id="patch_3">
 
4113
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4114
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4115
  </span> |
4116
+ Cell: combine | 4.18s
4117
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4118
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4119
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4200
  COMBINED BENCHMARK SUMMARY
4201
 
4202
  impl wl p50(ms) ok
4203
+ hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
4204
+ hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
4205
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4206
  hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
4207
  torch_layer_norm LN_B16_S2048_D4096 0.82 True
 
4227
  <div class="uv-install-logs" id="uv-logs-combine">
4228
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4229
  <div class="uv-logs-content" style="display: none;">
4230
+ Installed 37 packages in 195ms
4231
  </div>
4232
  </div>
4233
  <div class="cell-artifacts">
 
4240
  <rdf:RDF>
4241
  <ns2:Work>
4242
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4243
+ <dc:date>2025-10-29T15:51:05.081730</dc:date>
4244
  <dc:format>image/svg+xml</dc:format>
4245
  <dc:creator>
4246
  <ns2:Agent>
 
4324
  <g id="matplotlib.axis_2">
4325
  <g id="ytick_1">
4326
  <g id="grid-y--2" class="grid grid-y">
4327
+ <path d="M 47.72 409.909979 L 840.20233 409.909979 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4328
  </g>
4329
  <g id="line2d_5">
4330
  <defs>
4331
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4332
  </defs>
4333
  <g>
4334
+ <use ns4:href="#m0fca2865ba" x="47.72" y="409.909979" style="stroke: #000000; stroke-width: 0.8" />
4335
  </g>
4336
  </g>
4337
  <g id="text_5">
4338
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="413.709198" transform="rotate(-0 40.72 413.709198)">1.0</text>
4339
  </g>
4340
  </g>
4341
  <g id="ytick_2">
4342
  <g id="grid-y--3" class="grid grid-y">
4343
+ <path d="M 47.72 331.917289 L 840.20233 331.917289 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4344
  </g>
4345
  <g id="line2d_6">
4346
  <g>
4347
+ <use ns4:href="#m0fca2865ba" x="47.72" y="331.917289" style="stroke: #000000; stroke-width: 0.8" />
4348
  </g>
4349
  </g>
4350
  <g id="text_6">
4351
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="335.716508" transform="rotate(-0 40.72 335.716508)">1.5</text>
4352
  </g>
4353
  </g>
4354
  <g id="ytick_3">
4355
  <g id="grid-y--4" class="grid grid-y">
4356
+ <path d="M 47.72 253.924599 L 840.20233 253.924599 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4357
  </g>
4358
  <g id="line2d_7">
4359
  <g>
4360
+ <use ns4:href="#m0fca2865ba" x="47.72" y="253.924599" style="stroke: #000000; stroke-width: 0.8" />
4361
  </g>
4362
  </g>
4363
  <g id="text_7">
4364
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.723817" transform="rotate(-0 40.72 257.723817)">2.0</text>
4365
  </g>
4366
  </g>
4367
  <g id="ytick_4">
4368
  <g id="grid-y--5" class="grid grid-y">
4369
+ <path d="M 47.72 175.931908 L 840.20233 175.931908 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4370
  </g>
4371
  <g id="line2d_8">
4372
  <g>
4373
+ <use ns4:href="#m0fca2865ba" x="47.72" y="175.931908" style="stroke: #000000; stroke-width: 0.8" />
4374
  </g>
4375
  </g>
4376
  <g id="text_8">
4377
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.731127" transform="rotate(-0 40.72 179.731127)">2.5</text>
4378
  </g>
4379
  </g>
4380
  <g id="ytick_5">
4381
  <g id="grid-y--6" class="grid grid-y">
4382
+ <path d="M 47.72 97.939218 L 840.20233 97.939218 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4383
  </g>
4384
  <g id="line2d_9">
4385
  <g>
4386
+ <use ns4:href="#m0fca2865ba" x="47.72" y="97.939218" style="stroke: #000000; stroke-width: 0.8" />
4387
  </g>
4388
  </g>
4389
  <g id="text_9">
4390
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.738437" transform="rotate(-0 40.72 101.738437)">3.0</text>
4391
  </g>
4392
  </g>
4393
  <g id="label--y" class="ylabel">
 
4395
  </g>
4396
  </g>
4397
  <g id="series--torch-layer-norm" class="series">
4398
+ <path d="M 83.741924 437.689571 L 323.888085 303.515627 L 564.034245 314.85592 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4399
  <defs>
4400
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4401
  </defs>
4402
  <g clip-path="url(#p2214f54723)">
4403
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4404
+ <use ns4:href="#md7efaf3aec" x="323.888085" y="303.515627" style="fill: #1f77b4; stroke: #1f77b4" />
4405
+ <use ns4:href="#md7efaf3aec" x="564.034245" y="314.85592" style="fill: #1f77b4; stroke: #1f77b4" />
4406
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4407
  </g>
4408
  </g>
4409
  <g id="series--hf-kernels-layer-norm" class="series">
4410
+ <path d="M 83.741924 435.933176 L 323.888085 307.404498 L 564.034245 307.981644 L 804.180406 57.739446 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4411
  <defs>
4412
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4413
  </defs>
4414
  <g clip-path="url(#p2214f54723)">
4415
+ <use ns4:href="#m9b8c54d372" x="83.741924" y="435.933176" style="fill: #ff7f0e; stroke: #ff7f0e" />
4416
+ <use ns4:href="#m9b8c54d372" x="323.888085" y="307.404498" style="fill: #ff7f0e; stroke: #ff7f0e" />
4417
+ <use ns4:href="#m9b8c54d372" x="564.034245" y="307.981644" style="fill: #ff7f0e; stroke: #ff7f0e" />
4418
+ <use ns4:href="#m9b8c54d372" x="804.180406" y="57.739446" style="fill: #ff7f0e; stroke: #ff7f0e" />
4419
  </g>
4420
  </g>
4421
  <g id="patch_3">
rotary/impls/artifacts/benchmark/rotary.jsonl CHANGED
@@ -1,24 +1,24 @@
1
- {"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17006399997399058, "p50": 0.17533400000502297, "p90": 0.1853339999797754, "mean": 0.1802961999942454, "iqr": 0.014799999974002276, "raw_times": [0.17533400000502297, 0.20021500000666492, 0.17053400000577312, 0.1853339999797754, 0.17006399997399058], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18331500001522727, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
2
- {"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2246159999685915, "p50": 0.2266160000203854, "p90": 0.22888500001272405, "mean": 0.22735560000910482, "iqr": 0.002880000010918593, "raw_times": [0.22600500000180546, 0.22888500001272405, 0.2246159999685915, 0.2266160000203854, 0.2306560000420177], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2471160000254713, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
3
- {"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21813499995460006, "p50": 0.22189599997091136, "p90": 0.2272149999953399, "mean": 0.22315939997952228, "iqr": 0.007960000004914036, "raw_times": [0.2272149999953399, 0.22189599997091136, 0.21925499999042586, 0.21813499995460006, 0.2292959999863342], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2391049999914685, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
4
- {"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21478600001501036, "p50": 0.21544499998071842, "p90": 0.2178249999928994, "mean": 0.2161891999890031, "iqr": 0.0027799999884337012, "raw_times": [0.21544499998071842, 0.2178249999928994, 0.21478600001501036, 0.21784499995192164, 0.2150450000044657], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22814599998355334, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
5
- {"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2160950000416051, "p50": 0.22390499998437008, "p90": 0.22473600000694205, "mean": 0.22559540000202105, "iqr": 0.00507100003233063, "raw_times": [0.22390499998437008, 0.24357600000257662, 0.22473600000694205, 0.2160950000416051, 0.21966499997461142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.229085999990275, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
6
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21366499998975996, "p50": 0.21597500000325454, "p90": 0.21670500001391702, "mean": 0.2158129999884295, "iqr": 0.0008600000569458643, "raw_times": [0.21366499998975996, 0.2168749999782449, 0.21597500000325454, 0.21670500001391702, 0.21584499995697115], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21572499997546402, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
7
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2142449999951168, "p50": 0.21574499999132968, "p90": 0.2169850000086626, "mean": 0.21585539999477987, "iqr": 0.0022990000161371427, "raw_times": [0.2142449999951168, 0.2176159999862648, 0.21468599999252547, 0.2169850000086626, 0.21574499999132968], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2192349999745602, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
8
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21675499999673775, "p50": 0.21711599998752718, "p90": 0.21833499999956985, "mean": 0.2174776000060774, "iqr": 0.0015789999565640755, "raw_times": [0.21675499999673775, 0.21711599998752718, 0.21833499999956985, 0.21675600004300577, 0.2184260000035465], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22064600000248902, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
9
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2153649999740992, "p50": 0.21702599997297511, "p90": 0.21829499996783852, "mean": 0.21729759998834197, "iqr": 0.0014989999499448459, "raw_times": [0.2153649999740992, 0.21679600001789368, 0.21900600000890336, 0.21702599997297511, 0.21829499996783852], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22154499998805477, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
10
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2143060000321384, "p50": 0.2161449999675824, "p90": 0.21640500000330576, "mean": 0.21578740000904872, "iqr": 0.0008589999538344273, "raw_times": [0.21653499999274572, 0.21640500000330576, 0.2143060000321384, 0.2161449999675824, 0.21554600004947133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23455599995259035, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
11
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21488499999122723, "p50": 0.21633500000461936, "p90": 0.21918499999173946, "mean": 0.21730919999072285, "iqr": 0.004300000000512227, "raw_times": [0.21488499999122723, 0.21918499999173946, 0.21488499999122723, 0.22125599997480094, 0.21633500000461936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2185359999771208, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
12
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2230359999657594, "p50": 0.22526600002947816, "p90": 0.22695600000588456, "mean": 0.22723160000168718, "iqr": 0.0026509999884183344, "raw_times": [0.22526600002947816, 0.2230359999657594, 0.23659499998984757, 0.22430500001746623, 0.22695600000588456], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22456599998577076, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
13
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21608499997682884, "p50": 0.2175149999743553, "p90": 0.22948600002337116, "mean": 0.2247094000040306, "iqr": 0.012610999988282856, "raw_times": [0.21608499997682884, 0.2168750000350883, 0.22948600002337116, 0.24358600001050945, 0.2175149999743553], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21851499997183055, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
14
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2162149999662688, "p50": 0.21694499997693129, "p90": 0.2171250000060354, "mean": 0.21706100000074002, "iqr": 0.0003099999617006688, "raw_times": [0.2162149999662688, 0.21694499997693129, 0.2171250000060354, 0.21820500001012988, 0.21681500004433474], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21809500003655558, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
15
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2136749999976928, "p50": 0.21658500003240988, "p90": 0.21662599999672238, "mean": 0.21621120000645533, "iqr": 0.00066100000140068, "raw_times": [0.2136749999976928, 0.2159649999953217, 0.21658500003240988, 0.21662599999672238, 0.21820500001012988], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2181750000431748, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
16
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21695499998486412, "p50": 0.21774499998628016, "p90": 0.2285450000272249, "mean": 0.22256720000086716, "iqr": 0.010920000022451859, "raw_times": [0.21774499998628016, 0.21762500000477303, 0.2319660000011936, 0.21695499998486412, 0.2285450000272249], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22269599998026024, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
17
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21440599999777987, "p50": 0.21785499995985447, "p90": 0.2335159999802272, "mean": 0.2228595999895333, "iqr": 0.01891099998374557, "raw_times": [0.21440599999777987, 0.23391600001332336, 0.21785499995985447, 0.2335159999802272, 0.21460499999648164], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21932499998911226, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
18
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2133250000042608, "p50": 0.21422499997925115, "p90": 0.21653499999274572, "mean": 0.21708740000576654, "iqr": 0.0029589999712698045, "raw_times": [0.21357600002147592, 0.2133250000042608, 0.21653499999274572, 0.21422499997925115, 0.2277760000310991], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22739600001386862, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
19
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21597500000325454, "p50": 0.2176859999849512, "p90": 0.21771499996248167, "mean": 0.21758339998996234, "iqr": 0.0013999999737279722, "raw_times": [0.2176859999849512, 0.21771499996248167, 0.22022600001037063, 0.2163149999887537, 0.21597500000325454], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21613599994907418, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
20
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21577600000455277, "p50": 0.2173749999769825, "p90": 0.21900600000890336, "mean": 0.21836960000882755, "iqr": 0.0018509999790694565, "raw_times": [0.2171550000298339, 0.21577600000455277, 0.2225360000238652, 0.2173749999769825, 0.21900600000890336], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22321599999486352, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
21
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21538499998996485, "p50": 0.21647599999141676, "p90": 0.21717500004569956, "mean": 0.2167214000110107, "iqr": 0.001030000021273736, "raw_times": [0.21717500004569956, 0.2184260000035465, 0.21538499998996485, 0.21614500002442583, 0.21647599999141676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21872600001415776, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
22
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21501500003751062, "p50": 0.2168760000245129, "p90": 0.2187050000088675, "mean": 0.21949320001795058, "iqr": 0.0030500000320898835, "raw_times": [0.21565499997677762, 0.2187050000088675, 0.21501500003751062, 0.23121500004208428, 0.2168760000245129], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22076499999457155, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
23
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22937599999295344, "p50": 0.23008499999832566, "p90": 0.23144499999716572, "mean": 0.23359140000138723, "iqr": 0.0020100000028833165, "raw_times": [0.23144499999716572, 0.23008499999832566, 0.2294349999942824, 0.22937599999295344, 0.24761600002420892], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23195599999326078, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
24
- {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.637245999996594, "p50": 0.6388759999822469, "p90": 0.6389449999915087, "mean": 0.6396317999929124, "iqr": 0.0012190000120426703, "raw_times": [0.6388759999822469, 0.6453660000147465, 0.6389449999915087, 0.637245999996594, 0.637725999979466], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.6364359999793123, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
 
1
+ {"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0735019999638098, "p50": 0.07410199998503231, "p90": 0.07441199994673298, "mean": 0.07416379996811884, "iqr": 0.00038999996831989847, "raw_times": [0.07478099996660603, 0.0735019999638098, 0.07441199994673298, 0.07402199997841308, 0.07410199998503231], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08146199996872383, "peak_bytes": 3178496, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590452924915553e-08, "mae_k": 1.5487040982975486e-08, "mse_q": 2.5241010080938753e-15, "mse_k": 2.364223539299626e-15, "ref": "rotary_torch"}, "err": null}
2
+ {"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0912220000373054, "p50": 0.09200200003078862, "p90": 0.09276200000840618, "mean": 0.09224400001812683, "iqr": 0.0012400000173329317, "raw_times": [0.09152199999107324, 0.09276200000840618, 0.0912220000373054, 0.09200200003078862, 0.09371200002306068], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09689300003401513, "peak_bytes": 6356992, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5508486939097565e-08, "mae_k": 1.567566698668088e-08, "mse_q": 2.3630110116356316e-15, "mse_k": 2.416562128626943e-15, "ref": "rotary_torch"}, "err": null}
3
+ {"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08820200002901402, "p50": 0.09085200002800775, "p90": 0.0915720000307374, "mean": 0.09087420002060753, "iqr": 0.002170000016121776, "raw_times": [0.08820200002901402, 0.09434300000066287, 0.08940200001461562, 0.0915720000307374, 0.09085200002800775], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0964319999638974, "peak_bytes": 12615680, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5856898016863852e-08, "mae_k": 1.572981211950264e-08, "mse_q": 2.4771055025978386e-15, "mse_k": 2.4544071371937915e-15, "ref": "rotary_torch"}, "err": null}
4
+ {"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09069200001476929, "p50": 0.09134200001881254, "p90": 0.09142199996858835, "mean": 0.09263220000548245, "iqr": 0.0006699999630654929, "raw_times": [0.09069200001476929, 0.09075200000552286, 0.09142199996858835, 0.09895300001971918, 0.09134200001881254], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09313199996086041, "peak_bytes": 25231360, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5617658277733426e-08, "mae_k": 1.5788685914230882e-08, "mse_q": 2.4549424620164562e-15, "mse_k": 2.492823469483563e-15, "ref": "rotary_torch"}, "err": null}
5
+ {"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0885120000475581, "p50": 0.08998200001997247, "p90": 0.09122199998046199, "mean": 0.09028400000943293, "iqr": 0.0016600000094513234, "raw_times": [0.09122199998046199, 0.0885120000475581, 0.09214200002816142, 0.08998200001997247, 0.08956199997101066], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1227330000119764, "peak_bytes": 12779520, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5962712041073246e-08, "mae_k": 1.5743363945830424e-08, "mse_q": 2.534145124782417e-15, "mse_k": 2.451281585618423e-15, "ref": "rotary_torch"}, "err": null}
6
+ {"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08860200000526675, "p50": 0.09058200004119499, "p90": 0.09118299999499868, "mean": 0.09031840000943703, "iqr": 0.0011699999618031143, "raw_times": [0.08860200000526675, 0.09001300003319557, 0.09058200004119499, 0.09121199997252916, 0.09118299999499868], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09078199997247793, "peak_bytes": 25427968, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.578730035589615e-08, "mae_k": 1.5859711766097462e-08, "mse_q": 2.440287521479536e-15, "mse_k": 2.477901290051784e-15, "ref": "rotary_torch"}, "err": null}
7
+ {"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08772200004614206, "p50": 0.09064199997510514, "p90": 0.09105200001613412, "mean": 0.08990000001176668, "iqr": 0.002190000031987438, "raw_times": [0.08772200004614206, 0.09064199997510514, 0.09105200001613412, 0.0912220000373054, 0.08886199998414668], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09194199998319164, "peak_bytes": 50462720, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5775295736375483e-08, "mae_k": 1.5847881229547056e-08, "mse_q": 2.471039476146077e-15, "mse_k": 2.472378635235686e-15, "ref": "rotary_torch"}, "err": null}
8
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08855200002244601, "p50": 0.08938199999874996, "p90": 0.0907319999896572, "mean": 0.0897739999913938, "iqr": 0.0015100000041456951, "raw_times": [0.0892219999855115, 0.0909819999606043, 0.0907319999896572, 0.08855200002244601, 0.08938199999874996], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09457200002316313, "peak_bytes": 100925440, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5959869870130206e-08, "mae_k": 1.588083975434529e-08, "mse_q": 2.510663677418633e-15, "mse_k": 2.502786271009168e-15, "ref": "rotary_torch"}, "err": null}
9
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08880199999339311, "p50": 0.08953200000405559, "p90": 0.08999199997106189, "mean": 0.08967999999640597, "iqr": 0.0006899999789311551, "raw_times": [0.08880199999339311, 0.08930199999213073, 0.08999199997106189, 0.09077200002138852, 0.08953200000405559], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09282199999915974, "peak_bytes": 51118080, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5894533689220225e-08, "mae_k": 1.5873395042831362e-08, "mse_q": 2.5093181655819197e-15, "mse_k": 2.488611809911578e-15, "ref": "rotary_torch"}, "err": null}
10
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08904199995640738, "p50": 0.09102199999233562, "p90": 0.09121199997252916, "mean": 0.0907579999761765, "iqr": 0.0006099999723119254, "raw_times": [0.08904199995640738, 0.09191199995939314, 0.09121199997252916, 0.09060200000021723, 0.09102199999233562], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09379199997283649, "peak_bytes": 101711872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5936768349433805e-08, "mae_k": 1.5960043953100467e-08, "mse_q": 2.51039008577667e-15, "mse_k": 2.5111253103748867e-15, "ref": "rotary_torch"}, "err": null}
11
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09005199996181545, "p50": 0.09118200000557408, "p90": 0.0916120000056253, "mean": 0.09133820000215565, "iqr": 0.0005590000000665896, "raw_times": [0.09005199996181545, 0.09105300000555872, 0.09279200003220467, 0.0916120000056253, 0.09118200000557408], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09626199999956953, "peak_bytes": 201850880, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 1.9073486328125e-06, "absmax_k": 9.5367431640625e-07, "mae_q": 1.586510300910504e-08, "mae_k": 1.5813935050346117e-08, "mse_q": 2.499836478770355e-15, "mse_k": 2.4755639026338358e-15, "ref": "rotary_torch"}, "err": null}
12
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2600759999609181, "p50": 0.261636000004728, "p90": 0.2620960000285777, "mean": 0.26208780000160914, "iqr": 0.0012810000384888554, "raw_times": [0.2600759999609181, 0.261636000004728, 0.26581600002373307, 0.26081499999008884, 0.2620960000285777], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.263886000027469, "peak_bytes": 403701760, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.581049247079136e-08, "mae_k": 1.5861061797295406e-08, "mse_q": 2.4735094242202705e-15, "mse_k": 2.486832828964107e-15, "ref": "rotary_torch"}, "err": null}
13
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08898199996565381, "p50": 0.09088199999496283, "p90": 0.09099199996853713, "mean": 0.09348599999157159, "iqr": 0.001969999971151992, "raw_times": [0.08898199996565381, 0.09099199996853713, 0.10755200003131904, 0.09088199999496283, 0.08902199999738514], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09600300001011419, "peak_bytes": 137396224, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5824980437173508e-08, "mae_k": 1.5615324144846454e-08, "mse_q": 2.488090249374306e-15, "mse_k": 2.425079044911585e-15, "ref": "rotary_torch"}, "err": null}
14
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08880199999339311, "p50": 0.09035199997242671, "p90": 0.09093199997778356, "mean": 0.09011999998165265, "iqr": 0.0015400000279441883, "raw_times": [0.08939199994983937, 0.09093199997778356, 0.09035199997242671, 0.09112200001482051, 0.08880199999339311], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09145199999238685, "peak_bytes": 12648448, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5683587761827766e-08, "mae_k": 1.574532682013796e-08, "mse_q": 2.4310271220254415e-15, "mse_k": 2.4601385856313877e-15, "ref": "rotary_torch"}, "err": null}
15
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08985199997368909, "p50": 0.09101199998440279, "p90": 0.09125200000426048, "mean": 0.09087419999787016, "iqr": 0.0002900000026784255, "raw_times": [0.08985199997368909, 0.09096200000158206, 0.0912930000254164, 0.09125200000426048, 0.09101199998440279], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09303199999521894, "peak_bytes": 25198592, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5835009747888762e-08, "mae_k": 1.572560215379326e-08, "mse_q": 2.478222950813504e-15, "mse_k": 2.4541699679685603e-15, "ref": "rotary_torch"}, "err": null}
16
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08871200003568447, "p50": 0.0907719999645451, "p90": 0.09140200000956611, "mean": 0.09065600000894847, "iqr": 0.001259999976355175, "raw_times": [0.08871200003568447, 0.09225200000173572, 0.09014200003321093, 0.0907719999645451, 0.09140200000956611], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09131100000558945, "peak_bytes": 50397184, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5876850056883995e-08, "mae_k": 1.5927410501603845e-08, "mse_q": 2.504224532953606e-15, "mse_k": 2.503892919554756e-15, "ref": "rotary_torch"}, "err": null}
17
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08793200004220125, "p50": 0.0902419999988524, "p90": 0.09114200003068618, "mean": 0.09024000002000321, "iqr": 0.001160000010713702, "raw_times": [0.08793200004220125, 0.08998200001997247, 0.0902419999988524, 0.09114200003068618, 0.09190200000830373], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09403199999269418, "peak_bytes": 25362432, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5820052823301012e-08, "mae_k": 1.580205122309053e-08, "mse_q": 2.4876468276264184e-15, "mse_k": 2.4866062476507165e-15, "ref": "rotary_torch"}, "err": null}
18
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08730199999718025, "p50": 0.0906619999909708, "p90": 0.09115200003861901, "mean": 0.08998400001019036, "iqr": 0.0016399999935856613, "raw_times": [0.08730199999718025, 0.09115200003861901, 0.09129199997914839, 0.08951200004503335, 0.0906619999909708], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09093099998835896, "peak_bytes": 50593792, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5823172105911e-08, "mae_k": 1.582038855474366e-08, "mse_q": 2.464257071579175e-15, "mse_k": 2.4775099608301526e-15, "ref": "rotary_torch"}, "err": null}
19
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08923199999344433, "p50": 0.09018200000809884, "p90": 0.09221200002684782, "mean": 0.09105200000476543, "iqr": 0.0028300000280978566, "raw_times": [0.08923199999344433, 0.09221200002684782, 0.09018200000809884, 0.08938199999874996, 0.09425199999668621], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09410199999138058, "peak_bytes": 100794368, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5888783622131086e-08, "mae_k": 1.5861886026868888e-08, "mse_q": 2.4766798685418433e-15, "mse_k": 2.475923891636419e-15, "ref": "rotary_torch"}, "err": null}
20
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08850200003962527, "p50": 0.0899920000279053, "p90": 0.09176200001093093, "mean": 0.09526220001134789, "iqr": 0.002740000013545796, "raw_times": [0.08850200003962527, 0.0899920000279053, 0.11703299998089278, 0.08902199999738514, 0.09176200001093093], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09607300000880059, "peak_bytes": 201588736, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5826390864503992e-08, "mae_k": 1.5792682717119533e-08, "mse_q": 2.480465258783123e-15, "mse_k": 2.475580631534544e-15, "ref": "rotary_torch"}, "err": null}
21
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0902020000239645, "p50": 0.09163200002149097, "p90": 0.09188199999243807, "mean": 0.09142600000586754, "iqr": 0.0006299999881775875, "raw_times": [0.09163200002149097, 0.09216199998718366, 0.09188199999243807, 0.09125200000426048, 0.0902020000239645], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09537199997566859, "peak_bytes": 101449728, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.592899323554775e-08, "mae_k": 1.5925031959795888e-08, "mse_q": 2.50783882253954e-15, "mse_k": 2.5015648494992274e-15, "ref": "rotary_torch"}, "err": null}
22
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08815199998934986, "p50": 0.08920199996964584, "p90": 0.0900620000265917, "mean": 0.08925999999291889, "iqr": 0.001270000041131425, "raw_times": [0.08815199998934986, 0.09009199999354678, 0.0900620000265917, 0.08920199996964584, 0.08879199998546028], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09250199997268282, "peak_bytes": 202375168, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590209919299923e-08, "mae_k": 1.590130160877834e-08, "mse_q": 2.4971026799330918e-15, "mse_k": 2.506967649153289e-15, "ref": "rotary_torch"}, "err": null}
23
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.26207600001271203, "p50": 0.263255999982448, "p90": 0.2654460000144354, "mean": 0.26436599999897226, "iqr": 0.0022400000148081745, "raw_times": [0.26207600001271203, 0.263255999982448, 0.2678459999856386, 0.26320599999962724, 0.2654460000144354], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25824599998713893, "peak_bytes": 403177472, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5847520629108658e-08, "mae_k": 1.5862454461057496e-08, "mse_q": 2.4917348203881045e-15, "mse_k": 2.491306009958557e-15, "ref": "rotary_torch"}, "err": null}
24
+ {"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8428699999853961, "p50": 0.8440990000053716, "p90": 0.8457790000306886, "mean": 0.8458453999992344, "iqr": 0.0025290000280620006, "raw_times": [0.8428699999853961, 0.8532289999720888, 0.8432500000026266, 0.8440990000053716, 0.8457790000306886], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8568399999830945, "peak_bytes": 806354944, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.585225106737198e-08, "mae_k": 1.581303976649906e-08, "mse_q": 2.4866460581992374e-15, "mse_k": 2.4721545950211372e-15, "ref": "rotary_torch"}, "err": null}
rotary/impls/cells/benchmark.py CHANGED
@@ -4,6 +4,7 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
@@ -12,46 +13,36 @@
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 
15
 
 
 
16
 
17
- def apply_rotary_torch(x1, x2, cos, sin, conj=False):
18
- """Reference rotary implementation."""
19
- if not conj:
20
- out1 = x1 * cos - x2 * sin
21
- out2 = x1 * sin + x2 * cos
22
- else:
23
- out1 = x1 * cos + x2 * sin
24
- out2 = -x1 * sin + x2 * cos
25
- return out1, out2
26
 
27
-
28
- def torch_rotary(query, key, cos, sin, conj=False):
29
  rotary_dim = cos.shape[-1]
30
 
31
- # Clone inputs to avoid modifying them
32
  q_out = query.clone()
33
  k_out = key.clone()
34
 
35
  # Apply rotation to query
36
  q1 = q_out[..., :rotary_dim]
37
  q2 = q_out[..., rotary_dim : 2 * rotary_dim]
38
- q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj)
39
- q_out[..., :rotary_dim] = q_out_1
40
- q_out[..., rotary_dim : 2 * rotary_dim] = q_out_2
41
 
42
  # Apply rotation to key
43
  k1 = k_out[..., :rotary_dim]
44
  k2 = k_out[..., rotary_dim : 2 * rotary_dim]
45
- k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj)
46
- k_out[..., :rotary_dim] = k_out_1
47
- k_out[..., rotary_dim : 2 * rotary_dim] = k_out_2
48
 
49
  return q_out, k_out
50
 
51
 
52
  run_benchmark(
53
  kernel_type=KernelTypeEnum.ROTARY,
54
- impl_name="torch_eager",
55
- impl_tags={"family": "pytorch", "backend": "eager"},
56
- impl_func=torch_rotary,
 
57
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
+ # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
 
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
+ from kernels import get_kernel
17
 
18
+ # Load the rotary kernel
19
+ rotary = get_kernel("kernels-community/rotary")
20
 
 
 
 
 
 
 
 
 
 
21
 
22
+ def hf_kernels_rotary(query, key, cos, sin, conj=False):
 
23
  rotary_dim = cos.shape[-1]
24
 
25
+ # Clone to avoid modifying inputs
26
  q_out = query.clone()
27
  k_out = key.clone()
28
 
29
  # Apply rotation to query
30
  q1 = q_out[..., :rotary_dim]
31
  q2 = q_out[..., rotary_dim : 2 * rotary_dim]
32
+ rotary.apply_rotary(q1, q2, cos, sin, q1, q2, conj)
 
 
33
 
34
  # Apply rotation to key
35
  k1 = k_out[..., :rotary_dim]
36
  k2 = k_out[..., rotary_dim : 2 * rotary_dim]
37
+ rotary.apply_rotary(k1, k2, cos, sin, k1, k2, conj)
 
 
38
 
39
  return q_out, k_out
40
 
41
 
42
  run_benchmark(
43
  kernel_type=KernelTypeEnum.ROTARY,
44
+ impl_name="hf_kernels_rotary",
45
+ impl_tags={"family": "hf-kernels", "backend": "cuda"},
46
+ impl_func=hf_kernels_rotary,
47
+ dtype="float32",
48
  )
rotary/impls/hf_kernels_rotary.html CHANGED
The diff for this file is too large to render. See raw diff
 
rotary/impls/torch_rotary.html CHANGED
The diff for this file is too large to render. See raw diff
 
rotary/index.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
rotary/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 36e71e631ab1a00097df3bc72a4532b4b383ed31a1df2368bd041e765254a9c3
  • Pointer size: 130 Bytes
  • Size of remote file: 31 kB

Git LFS Details

  • SHA256: 1073fb7d2fda354b1fcc8c64879bfd105a9e36e75f5ab25c8a5cb53277099549
  • Pointer size: 130 Bytes
  • Size of remote file: 37.9 kB
rotary/results/combined_results.html CHANGED
@@ -809,6 +809,14 @@
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
 
 
 
 
 
 
 
 
812
  /* CSV table styling */
813
  .artifact-csv {
814
  margin-top: 1rem;
@@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
- <dc:date>2025-10-29T14:27:54.393501</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
@@ -4216,108 +4224,179 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4216
  <g id="matplotlib.axis_2">
4217
  <g id="ytick_1">
4218
  <g id="grid-y--2" class="grid grid-y">
4219
- <path d="M 47.72 385.895403 L 823.142937 385.895403 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4220
  </g>
4221
  <g id="line2d_25">
4222
  <defs>
4223
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4224
  </defs>
4225
  <g>
4226
- <use ns4:href="#m0fca2865ba" x="47.72" y="385.895403" style="stroke: #000000; stroke-width: 0.8" />
4227
  </g>
4228
  </g>
4229
  <g id="text_25">
4230
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="389.694621" transform="rotate(-0 40.72 389.694621)">0.2</text>
4231
  </g>
4232
  </g>
4233
  <g id="ytick_2">
4234
  <g id="grid-y--3" class="grid grid-y">
4235
- <path d="M 47.72 308.195371 L 823.142937 308.195371 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4236
  </g>
4237
  <g id="line2d_26">
4238
  <g>
4239
- <use ns4:href="#m0fca2865ba" x="47.72" y="308.195371" style="stroke: #000000; stroke-width: 0.8" />
4240
  </g>
4241
  </g>
4242
  <g id="text_26">
4243
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="311.99459" transform="rotate(-0 40.72 311.99459)">0.3</text>
4244
  </g>
4245
  </g>
4246
  <g id="ytick_3">
4247
  <g id="grid-y--4" class="grid grid-y">
4248
- <path d="M 47.72 230.49534 L 823.142937 230.49534 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4249
  </g>
4250
  <g id="line2d_27">
4251
  <g>
4252
- <use ns4:href="#m0fca2865ba" x="47.72" y="230.49534" style="stroke: #000000; stroke-width: 0.8" />
4253
  </g>
4254
  </g>
4255
  <g id="text_27">
4256
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="234.294559" transform="rotate(-0 40.72 234.294559)">0.4</text>
4257
  </g>
4258
  </g>
4259
  <g id="ytick_4">
4260
  <g id="grid-y--5" class="grid grid-y">
4261
- <path d="M 47.72 152.795309 L 823.142937 152.795309 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4262
  </g>
4263
  <g id="line2d_28">
4264
  <g>
4265
- <use ns4:href="#m0fca2865ba" x="47.72" y="152.795309" style="stroke: #000000; stroke-width: 0.8" />
4266
  </g>
4267
  </g>
4268
  <g id="text_28">
4269
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="156.594528" transform="rotate(-0 40.72 156.594528)">0.5</text>
4270
  </g>
4271
  </g>
4272
  <g id="ytick_5">
4273
  <g id="grid-y--6" class="grid grid-y">
4274
- <path d="M 47.72 75.095278 L 823.142937 75.095278 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4275
  </g>
4276
  <g id="line2d_29">
4277
  <g>
4278
- <use ns4:href="#m0fca2865ba" x="47.72" y="75.095278" style="stroke: #000000; stroke-width: 0.8" />
4279
  </g>
4280
  </g>
4281
  <g id="text_29">
4282
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="78.894497" transform="rotate(-0 40.72 78.894497)">0.6</text>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4283
  </g>
4284
  </g>
4285
  <g id="label--y" class="ylabel">
4286
  <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="224.974753" transform="rotate(-90 18.737188 224.974753)">Latency P50 (ms)</text>
4287
  </g>
4288
  </g>
4289
- <g id="series--torch-eager" class="series">
4290
- <path d="M 82.966497 405.060892 L 113.615625 365.214762 L 144.264753 368.882204 L 174.913881 373.894633 L 205.563009 367.32121 L 236.212137 373.482823 L 266.861265 373.661533 L 297.510393 372.596265 L 328.159521 372.666195 L 358.808648 373.350733 L 389.457776 373.203103 L 420.106904 366.263713 L 450.756032 372.286242 L 481.40516 372.729132 L 512.054288 373.008852 L 542.703416 372.107532 L 573.352544 372.022062 L 604.001672 374.842573 L 634.6508 372.153375 L 665.299928 372.395022 L 695.949056 373.093546 L 726.598184 372.782745 L 757.247312 362.519348 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4291
  <defs>
4292
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4293
  </defs>
4294
  <g clip-path="url(#p088c925177)">
4295
  <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
4296
- <use ns4:href="#md7efaf3aec" x="113.615625" y="365.214762" style="fill: #1f77b4; stroke: #1f77b4" />
4297
- <use ns4:href="#md7efaf3aec" x="144.264753" y="368.882204" style="fill: #1f77b4; stroke: #1f77b4" />
4298
- <use ns4:href="#md7efaf3aec" x="174.913881" y="373.894633" style="fill: #1f77b4; stroke: #1f77b4" />
4299
- <use ns4:href="#md7efaf3aec" x="205.563009" y="367.32121" style="fill: #1f77b4; stroke: #1f77b4" />
4300
- <use ns4:href="#md7efaf3aec" x="236.212137" y="373.482823" style="fill: #1f77b4; stroke: #1f77b4" />
4301
- <use ns4:href="#md7efaf3aec" x="266.861265" y="373.661533" style="fill: #1f77b4; stroke: #1f77b4" />
4302
- <use ns4:href="#md7efaf3aec" x="297.510393" y="372.596265" style="fill: #1f77b4; stroke: #1f77b4" />
4303
- <use ns4:href="#md7efaf3aec" x="328.159521" y="372.666195" style="fill: #1f77b4; stroke: #1f77b4" />
4304
- <use ns4:href="#md7efaf3aec" x="358.808648" y="373.350733" style="fill: #1f77b4; stroke: #1f77b4" />
4305
- <use ns4:href="#md7efaf3aec" x="389.457776" y="373.203103" style="fill: #1f77b4; stroke: #1f77b4" />
4306
- <use ns4:href="#md7efaf3aec" x="420.106904" y="366.263713" style="fill: #1f77b4; stroke: #1f77b4" />
4307
- <use ns4:href="#md7efaf3aec" x="450.756032" y="372.286242" style="fill: #1f77b4; stroke: #1f77b4" />
4308
- <use ns4:href="#md7efaf3aec" x="481.40516" y="372.729132" style="fill: #1f77b4; stroke: #1f77b4" />
4309
- <use ns4:href="#md7efaf3aec" x="512.054288" y="373.008852" style="fill: #1f77b4; stroke: #1f77b4" />
4310
- <use ns4:href="#md7efaf3aec" x="542.703416" y="372.107532" style="fill: #1f77b4; stroke: #1f77b4" />
4311
- <use ns4:href="#md7efaf3aec" x="573.352544" y="372.022062" style="fill: #1f77b4; stroke: #1f77b4" />
4312
- <use ns4:href="#md7efaf3aec" x="604.001672" y="374.842573" style="fill: #1f77b4; stroke: #1f77b4" />
4313
- <use ns4:href="#md7efaf3aec" x="634.6508" y="372.153375" style="fill: #1f77b4; stroke: #1f77b4" />
4314
- <use ns4:href="#md7efaf3aec" x="665.299928" y="372.395022" style="fill: #1f77b4; stroke: #1f77b4" />
4315
- <use ns4:href="#md7efaf3aec" x="695.949056" y="373.093546" style="fill: #1f77b4; stroke: #1f77b4" />
4316
- <use ns4:href="#md7efaf3aec" x="726.598184" y="372.782745" style="fill: #1f77b4; stroke: #1f77b4" />
4317
- <use ns4:href="#md7efaf3aec" x="757.247312" y="362.519348" style="fill: #1f77b4; stroke: #1f77b4" />
4318
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
4319
  </g>
4320
  </g>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4321
  <g id="patch_3">
4322
  <path d="M 47.72 423.069506 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4323
  </g>
@@ -4330,21 +4409,30 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4330
  <g id="patch_6">
4331
  <path d="M 47.72 26.88 L 823.142937 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4332
  </g>
4333
- <g id="text_30">
4334
  <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.431468" y="20.88" transform="rotate(-0 435.431468 20.88)">Attention Implementation Latency</text>
4335
  </g>
4336
  <g id="legend" class="legend">
4337
  <g id="patch_7">
4338
- <path d="M 54.72 49.83625 L 146.374687 49.83625 Q 148.374687 49.83625 148.374687 47.83625 L 148.374687 33.88 Q 148.374687 31.88 146.374687 31.88 L 54.72 31.88 Q 52.72 31.88 52.72 33.88 L 52.72 47.83625 Q 52.72 49.83625 54.72 49.83625 L 54.72 49.83625 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4339
  </g>
4340
- <g id="line2d_30">
4341
  <path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4342
  <g>
4343
  <use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4344
  </g>
4345
  </g>
 
 
 
 
 
 
 
 
 
4346
  <g id="legend-label--torch-eager" class="legend">
4347
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">torch_eager</text>
4348
  </g>
4349
  </g>
4350
  </g>
@@ -4364,7 +4452,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4364
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4365
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4366
  </span> |
4367
- Cell: combine | 4.35s
4368
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4369
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4370
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4436,11 +4524,11 @@ Cell: combine | 4.35s
4436
  <div class="cell-stdout"><pre class="stdout-text">======================================================================
4437
  LOADING BENCHMARK DATA
4438
  ======================================================================
4439
- ✓ HF Kernels Rotary : /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/49ec9501b131c967277abe3cccb638422565260339bb30f5ea386b0076f2183e
4440
  ✓ PyTorch Rotary : /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/abf801d6445dfa81a8dd7b2e6257930c39c18160a9b97a739858c3b244e16cc5
4441
 
4442
  ✓ Found HF Kernels Rotary
4443
- Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/49ec9501b131c967277abe3cccb638422565260339bb30f5ea386b0076f2183e/rotary.jsonl
4444
  ✓ Found PyTorch Rotary
4445
  Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/abf801d6445dfa81a8dd7b2e6257930c39c18160a9b97a739858c3b244e16cc5/rotary.jsonl
4446
 
@@ -4451,54 +4539,54 @@ Summary: 2 found, 0 skipped, 0 missing
4451
  COMBINED BENCHMARK SUMMARY
4452
 
4453
  impl wl p50(ms) ok
4454
- hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 False
4455
- hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 False
4456
- hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.10 False
4457
- hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 False
4458
- hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.09 False
4459
- hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 False
4460
- hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 False
4461
- hf_kernels_rotary cuda_B1_S2048_H8_D64_R32 0.09 False
4462
- hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09 False
4463
- hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09 False
4464
- hf_kernels_rotary cuda_B1_S512_H8_D128_R64 0.09 False
4465
- hf_kernels_rotary cuda_B1_S512_H8_D64_R32 0.09 False
4466
- hf_kernels_rotary cuda_B2_S128_H32_D128_R64 0.09 False
4467
- hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09 False
4468
- hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09 False
4469
- hf_kernels_rotary cuda_B2_S128_H8_D64_R32 0.09 False
4470
- hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.28 False
4471
- hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.10 False
4472
- hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09 False
4473
- hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.09 False
4474
- hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09 False
4475
- hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.09 False
4476
- hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 False
4477
- hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 False
4478
- torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
4479
  torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
4480
- torch_eager cuda_B1_S128_H8_D128_R64 0.23 True
4481
- torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
4482
- torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True
4483
- torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True
4484
- torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True
4485
- torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True
4486
- torch_eager cuda_B1_S512_H32_D128_R64 0.22 True
4487
- torch_eager cuda_B1_S512_H32_D64_R32 0.22 True
4488
- torch_eager cuda_B1_S512_H8_D128_R64 0.22 True
4489
  torch_eager cuda_B1_S512_H8_D64_R32 0.22 True
4490
- torch_eager cuda_B2_S128_H32_D128_R64 0.22 True
4491
- torch_eager cuda_B2_S128_H32_D64_R32 0.22 True
4492
- torch_eager cuda_B2_S128_H8_D128_R64 0.22 True
4493
- torch_eager cuda_B2_S128_H8_D64_R32 0.22 True
4494
  torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True
4495
  torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
4496
  torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True
4497
- torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
4498
- torch_eager cuda_B2_S512_H32_D128_R64 0.22 True
4499
- torch_eager cuda_B2_S512_H32_D64_R32 0.22 True
4500
  torch_eager cuda_B2_S512_H8_D128_R64 0.21 True
4501
- torch_eager cuda_B2_S512_H8_D64_R32 0.22 True
4502
 
4503
  GENERATING COMBINED VISUALIZATION
4504
 
@@ -4518,7 +4606,7 @@ Implementations included:
4518
  <div class="uv-install-logs" id="uv-logs-combine">
4519
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4520
  <div class="uv-logs-content" style="display: none;">
4521
- Installed 37 packages in 239ms
4522
  </div>
4523
  </div>
4524
  <div class="cell-artifacts">
@@ -4531,7 +4619,7 @@ Installed 37 packages in 239ms
4531
  <rdf:RDF>
4532
  <ns2:Work>
4533
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4534
- <dc:date>2025-10-29T14:27:54.393501</dc:date>
4535
  <dc:format>image/svg+xml</dc:format>
4536
  <dc:creator>
4537
  <ns2:Agent>
@@ -4875,108 +4963,179 @@ Installed 37 packages in 239ms
4875
  <g id="matplotlib.axis_2">
4876
  <g id="ytick_1">
4877
  <g id="grid-y--2" class="grid grid-y">
4878
- <path d="M 47.72 385.895403 L 823.142937 385.895403 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4879
  </g>
4880
  <g id="line2d_25">
4881
  <defs>
4882
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4883
  </defs>
4884
  <g>
4885
- <use ns4:href="#m0fca2865ba" x="47.72" y="385.895403" style="stroke: #000000; stroke-width: 0.8" />
4886
  </g>
4887
  </g>
4888
  <g id="text_25">
4889
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="389.694621" transform="rotate(-0 40.72 389.694621)">0.2</text>
4890
  </g>
4891
  </g>
4892
  <g id="ytick_2">
4893
  <g id="grid-y--3" class="grid grid-y">
4894
- <path d="M 47.72 308.195371 L 823.142937 308.195371 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4895
  </g>
4896
  <g id="line2d_26">
4897
  <g>
4898
- <use ns4:href="#m0fca2865ba" x="47.72" y="308.195371" style="stroke: #000000; stroke-width: 0.8" />
4899
  </g>
4900
  </g>
4901
  <g id="text_26">
4902
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="311.99459" transform="rotate(-0 40.72 311.99459)">0.3</text>
4903
  </g>
4904
  </g>
4905
  <g id="ytick_3">
4906
  <g id="grid-y--4" class="grid grid-y">
4907
- <path d="M 47.72 230.49534 L 823.142937 230.49534 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4908
  </g>
4909
  <g id="line2d_27">
4910
  <g>
4911
- <use ns4:href="#m0fca2865ba" x="47.72" y="230.49534" style="stroke: #000000; stroke-width: 0.8" />
4912
  </g>
4913
  </g>
4914
  <g id="text_27">
4915
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="234.294559" transform="rotate(-0 40.72 234.294559)">0.4</text>
4916
  </g>
4917
  </g>
4918
  <g id="ytick_4">
4919
  <g id="grid-y--5" class="grid grid-y">
4920
- <path d="M 47.72 152.795309 L 823.142937 152.795309 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4921
  </g>
4922
  <g id="line2d_28">
4923
  <g>
4924
- <use ns4:href="#m0fca2865ba" x="47.72" y="152.795309" style="stroke: #000000; stroke-width: 0.8" />
4925
  </g>
4926
  </g>
4927
  <g id="text_28">
4928
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="156.594528" transform="rotate(-0 40.72 156.594528)">0.5</text>
4929
  </g>
4930
  </g>
4931
  <g id="ytick_5">
4932
  <g id="grid-y--6" class="grid grid-y">
4933
- <path d="M 47.72 75.095278 L 823.142937 75.095278 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4934
  </g>
4935
  <g id="line2d_29">
4936
  <g>
4937
- <use ns4:href="#m0fca2865ba" x="47.72" y="75.095278" style="stroke: #000000; stroke-width: 0.8" />
4938
  </g>
4939
  </g>
4940
  <g id="text_29">
4941
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="78.894497" transform="rotate(-0 40.72 78.894497)">0.6</text>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4942
  </g>
4943
  </g>
4944
  <g id="label--y" class="ylabel">
4945
  <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="224.974753" transform="rotate(-90 18.737188 224.974753)">Latency P50 (ms)</text>
4946
  </g>
4947
  </g>
4948
- <g id="series--torch-eager" class="series">
4949
- <path d="M 82.966497 405.060892 L 113.615625 365.214762 L 144.264753 368.882204 L 174.913881 373.894633 L 205.563009 367.32121 L 236.212137 373.482823 L 266.861265 373.661533 L 297.510393 372.596265 L 328.159521 372.666195 L 358.808648 373.350733 L 389.457776 373.203103 L 420.106904 366.263713 L 450.756032 372.286242 L 481.40516 372.729132 L 512.054288 373.008852 L 542.703416 372.107532 L 573.352544 372.022062 L 604.001672 374.842573 L 634.6508 372.153375 L 665.299928 372.395022 L 695.949056 373.093546 L 726.598184 372.782745 L 757.247312 362.519348 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4950
  <defs>
4951
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4952
  </defs>
4953
  <g clip-path="url(#p088c925177)">
4954
  <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
4955
- <use ns4:href="#md7efaf3aec" x="113.615625" y="365.214762" style="fill: #1f77b4; stroke: #1f77b4" />
4956
- <use ns4:href="#md7efaf3aec" x="144.264753" y="368.882204" style="fill: #1f77b4; stroke: #1f77b4" />
4957
- <use ns4:href="#md7efaf3aec" x="174.913881" y="373.894633" style="fill: #1f77b4; stroke: #1f77b4" />
4958
- <use ns4:href="#md7efaf3aec" x="205.563009" y="367.32121" style="fill: #1f77b4; stroke: #1f77b4" />
4959
- <use ns4:href="#md7efaf3aec" x="236.212137" y="373.482823" style="fill: #1f77b4; stroke: #1f77b4" />
4960
- <use ns4:href="#md7efaf3aec" x="266.861265" y="373.661533" style="fill: #1f77b4; stroke: #1f77b4" />
4961
- <use ns4:href="#md7efaf3aec" x="297.510393" y="372.596265" style="fill: #1f77b4; stroke: #1f77b4" />
4962
- <use ns4:href="#md7efaf3aec" x="328.159521" y="372.666195" style="fill: #1f77b4; stroke: #1f77b4" />
4963
- <use ns4:href="#md7efaf3aec" x="358.808648" y="373.350733" style="fill: #1f77b4; stroke: #1f77b4" />
4964
- <use ns4:href="#md7efaf3aec" x="389.457776" y="373.203103" style="fill: #1f77b4; stroke: #1f77b4" />
4965
- <use ns4:href="#md7efaf3aec" x="420.106904" y="366.263713" style="fill: #1f77b4; stroke: #1f77b4" />
4966
- <use ns4:href="#md7efaf3aec" x="450.756032" y="372.286242" style="fill: #1f77b4; stroke: #1f77b4" />
4967
- <use ns4:href="#md7efaf3aec" x="481.40516" y="372.729132" style="fill: #1f77b4; stroke: #1f77b4" />
4968
- <use ns4:href="#md7efaf3aec" x="512.054288" y="373.008852" style="fill: #1f77b4; stroke: #1f77b4" />
4969
- <use ns4:href="#md7efaf3aec" x="542.703416" y="372.107532" style="fill: #1f77b4; stroke: #1f77b4" />
4970
- <use ns4:href="#md7efaf3aec" x="573.352544" y="372.022062" style="fill: #1f77b4; stroke: #1f77b4" />
4971
- <use ns4:href="#md7efaf3aec" x="604.001672" y="374.842573" style="fill: #1f77b4; stroke: #1f77b4" />
4972
- <use ns4:href="#md7efaf3aec" x="634.6508" y="372.153375" style="fill: #1f77b4; stroke: #1f77b4" />
4973
- <use ns4:href="#md7efaf3aec" x="665.299928" y="372.395022" style="fill: #1f77b4; stroke: #1f77b4" />
4974
- <use ns4:href="#md7efaf3aec" x="695.949056" y="373.093546" style="fill: #1f77b4; stroke: #1f77b4" />
4975
- <use ns4:href="#md7efaf3aec" x="726.598184" y="372.782745" style="fill: #1f77b4; stroke: #1f77b4" />
4976
- <use ns4:href="#md7efaf3aec" x="757.247312" y="362.519348" style="fill: #1f77b4; stroke: #1f77b4" />
4977
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
4978
  </g>
4979
  </g>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4980
  <g id="patch_3">
4981
  <path d="M 47.72 423.069506 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4982
  </g>
@@ -4989,21 +5148,30 @@ Installed 37 packages in 239ms
4989
  <g id="patch_6">
4990
  <path d="M 47.72 26.88 L 823.142937 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4991
  </g>
4992
- <g id="text_30">
4993
  <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.431468" y="20.88" transform="rotate(-0 435.431468 20.88)">Attention Implementation Latency</text>
4994
  </g>
4995
  <g id="legend" class="legend">
4996
  <g id="patch_7">
4997
- <path d="M 54.72 49.83625 L 146.374687 49.83625 Q 148.374687 49.83625 148.374687 47.83625 L 148.374687 33.88 Q 148.374687 31.88 146.374687 31.88 L 54.72 31.88 Q 52.72 31.88 52.72 33.88 L 52.72 47.83625 Q 52.72 49.83625 54.72 49.83625 L 54.72 49.83625 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4998
  </g>
4999
- <g id="line2d_30">
5000
  <path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
5001
  <g>
5002
  <use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
5003
  </g>
5004
  </g>
 
 
 
 
 
 
 
 
 
5005
  <g id="legend-label--torch-eager" class="legend">
5006
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">torch_eager</text>
5007
  </g>
5008
  </g>
5009
  </g>
 
809
  .artifact-preview svg {
810
  background: transparent;
811
  }
812
+ /* Invert SVG images in dark mode */
813
+ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
814
+ filter: invert(0.9) hue-rotate(180deg);
815
+ }
816
+ /* Keep SVG images readable in monocolor mode */
817
+ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
818
+ filter: none;
819
+ }
820
  /* CSV table styling */
821
  .artifact-csv {
822
  margin-top: 1rem;
 
3880
  <rdf:RDF>
3881
  <ns2:Work>
3882
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3883
+ <dc:date>2025-10-29T15:51:00.751980</dc:date>
3884
  <dc:format>image/svg+xml</dc:format>
3885
  <dc:creator>
3886
  <ns2:Agent>
 
4224
  <g id="matplotlib.axis_2">
4225
  <g id="ytick_1">
4226
  <g id="grid-y--2" class="grid grid-y">
4227
+ <path d="M 47.72 392.946895 L 823.142937 392.946895 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4228
  </g>
4229
  <g id="line2d_25">
4230
  <defs>
4231
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4232
  </defs>
4233
  <g>
4234
+ <use ns4:href="#m0fca2865ba" x="47.72" y="392.946895" style="stroke: #000000; stroke-width: 0.8" />
4235
  </g>
4236
  </g>
4237
  <g id="text_25">
4238
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="396.746114" transform="rotate(-0 40.72 396.746114)">0.1</text>
4239
  </g>
4240
  </g>
4241
  <g id="ytick_2">
4242
  <g id="grid-y--3" class="grid grid-y">
4243
+ <path d="M 47.72 346.171092 L 823.142937 346.171092 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4244
  </g>
4245
  <g id="line2d_26">
4246
  <g>
4247
+ <use ns4:href="#m0fca2865ba" x="47.72" y="346.171092" style="stroke: #000000; stroke-width: 0.8" />
4248
  </g>
4249
  </g>
4250
  <g id="text_26">
4251
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="349.970311" transform="rotate(-0 40.72 349.970311)">0.2</text>
4252
  </g>
4253
  </g>
4254
  <g id="ytick_3">
4255
  <g id="grid-y--4" class="grid grid-y">
4256
+ <path d="M 47.72 299.395289 L 823.142937 299.395289 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4257
  </g>
4258
  <g id="line2d_27">
4259
  <g>
4260
+ <use ns4:href="#m0fca2865ba" x="47.72" y="299.395289" style="stroke: #000000; stroke-width: 0.8" />
4261
  </g>
4262
  </g>
4263
  <g id="text_27">
4264
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="303.194508" transform="rotate(-0 40.72 303.194508)">0.3</text>
4265
  </g>
4266
  </g>
4267
  <g id="ytick_4">
4268
  <g id="grid-y--5" class="grid grid-y">
4269
+ <path d="M 47.72 252.619486 L 823.142937 252.619486 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4270
  </g>
4271
  <g id="line2d_28">
4272
  <g>
4273
+ <use ns4:href="#m0fca2865ba" x="47.72" y="252.619486" style="stroke: #000000; stroke-width: 0.8" />
4274
  </g>
4275
  </g>
4276
  <g id="text_28">
4277
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.418705" transform="rotate(-0 40.72 256.418705)">0.4</text>
4278
  </g>
4279
  </g>
4280
  <g id="ytick_5">
4281
  <g id="grid-y--6" class="grid grid-y">
4282
+ <path d="M 47.72 205.843684 L 823.142937 205.843684 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4283
  </g>
4284
  <g id="line2d_29">
4285
  <g>
4286
+ <use ns4:href="#m0fca2865ba" x="47.72" y="205.843684" style="stroke: #000000; stroke-width: 0.8" />
4287
  </g>
4288
  </g>
4289
  <g id="text_29">
4290
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="209.642902" transform="rotate(-0 40.72 209.642902)">0.5</text>
4291
+ </g>
4292
+ </g>
4293
+ <g id="ytick_6">
4294
+ <g id="grid-y--7" class="grid grid-y">
4295
+ <path d="M 47.72 159.067881 L 823.142937 159.067881 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4296
+ </g>
4297
+ <g id="line2d_30">
4298
+ <g>
4299
+ <use ns4:href="#m0fca2865ba" x="47.72" y="159.067881" style="stroke: #000000; stroke-width: 0.8" />
4300
+ </g>
4301
+ </g>
4302
+ <g id="text_30">
4303
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="162.8671" transform="rotate(-0 40.72 162.8671)">0.6</text>
4304
+ </g>
4305
+ </g>
4306
+ <g id="ytick_7">
4307
+ <g id="grid-y--8" class="grid grid-y">
4308
+ <path d="M 47.72 112.292078 L 823.142937 112.292078 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4309
+ </g>
4310
+ <g id="line2d_31">
4311
+ <g>
4312
+ <use ns4:href="#m0fca2865ba" x="47.72" y="112.292078" style="stroke: #000000; stroke-width: 0.8" />
4313
+ </g>
4314
+ </g>
4315
+ <g id="text_31">
4316
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="116.091297" transform="rotate(-0 40.72 116.091297)">0.7</text>
4317
+ </g>
4318
+ </g>
4319
+ <g id="ytick_8">
4320
+ <g id="grid-y--9" class="grid grid-y">
4321
+ <path d="M 47.72 65.516275 L 823.142937 65.516275 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4322
+ </g>
4323
+ <g id="line2d_32">
4324
+ <g>
4325
+ <use ns4:href="#m0fca2865ba" x="47.72" y="65.516275" style="stroke: #000000; stroke-width: 0.8" />
4326
+ </g>
4327
+ </g>
4328
+ <g id="text_32">
4329
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="69.315494" transform="rotate(-0 40.72 69.315494)">0.8</text>
4330
  </g>
4331
  </g>
4332
  <g id="label--y" class="ylabel">
4333
  <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="224.974753" transform="rotate(-90 18.737188 224.974753)">Latency P50 (ms)</text>
4334
  </g>
4335
  </g>
4336
+ <g id="series--hf-kernels-rotary" class="series">
4337
+ <path d="M 82.966497 405.060892 L 113.615625 396.688024 L 144.264753 397.225945 L 174.913881 396.996744 L 205.563009 397.632895 L 236.212137 397.35224 L 266.861265 397.324175 L 297.510393 397.91355 L 328.159521 397.843386 L 358.808648 397.146426 L 389.457776 397.071585 L 420.106904 317.340358 L 450.756032 397.211913 L 481.40516 397.459824 L 512.054288 397.151104 L 542.703416 397.263366 L 573.352544 397.511278 L 604.001672 397.314819 L 634.6508 397.539343 L 665.299928 397.628217 L 695.949056 396.861094 L 726.598184 397.997746 L 757.247312 316.58259 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4338
  <defs>
4339
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4340
  </defs>
4341
  <g clip-path="url(#p088c925177)">
4342
  <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
4343
+ <use ns4:href="#md7efaf3aec" x="113.615625" y="396.688024" style="fill: #1f77b4; stroke: #1f77b4" />
4344
+ <use ns4:href="#md7efaf3aec" x="144.264753" y="397.225945" style="fill: #1f77b4; stroke: #1f77b4" />
4345
+ <use ns4:href="#md7efaf3aec" x="174.913881" y="396.996744" style="fill: #1f77b4; stroke: #1f77b4" />
4346
+ <use ns4:href="#md7efaf3aec" x="205.563009" y="397.632895" style="fill: #1f77b4; stroke: #1f77b4" />
4347
+ <use ns4:href="#md7efaf3aec" x="236.212137" y="397.35224" style="fill: #1f77b4; stroke: #1f77b4" />
4348
+ <use ns4:href="#md7efaf3aec" x="266.861265" y="397.324175" style="fill: #1f77b4; stroke: #1f77b4" />
4349
+ <use ns4:href="#md7efaf3aec" x="297.510393" y="397.91355" style="fill: #1f77b4; stroke: #1f77b4" />
4350
+ <use ns4:href="#md7efaf3aec" x="328.159521" y="397.843386" style="fill: #1f77b4; stroke: #1f77b4" />
4351
+ <use ns4:href="#md7efaf3aec" x="358.808648" y="397.146426" style="fill: #1f77b4; stroke: #1f77b4" />
4352
+ <use ns4:href="#md7efaf3aec" x="389.457776" y="397.071585" style="fill: #1f77b4; stroke: #1f77b4" />
4353
+ <use ns4:href="#md7efaf3aec" x="420.106904" y="317.340358" style="fill: #1f77b4; stroke: #1f77b4" />
4354
+ <use ns4:href="#md7efaf3aec" x="450.756032" y="397.211913" style="fill: #1f77b4; stroke: #1f77b4" />
4355
+ <use ns4:href="#md7efaf3aec" x="481.40516" y="397.459824" style="fill: #1f77b4; stroke: #1f77b4" />
4356
+ <use ns4:href="#md7efaf3aec" x="512.054288" y="397.151104" style="fill: #1f77b4; stroke: #1f77b4" />
4357
+ <use ns4:href="#md7efaf3aec" x="542.703416" y="397.263366" style="fill: #1f77b4; stroke: #1f77b4" />
4358
+ <use ns4:href="#md7efaf3aec" x="573.352544" y="397.511278" style="fill: #1f77b4; stroke: #1f77b4" />
4359
+ <use ns4:href="#md7efaf3aec" x="604.001672" y="397.314819" style="fill: #1f77b4; stroke: #1f77b4" />
4360
+ <use ns4:href="#md7efaf3aec" x="634.6508" y="397.539343" style="fill: #1f77b4; stroke: #1f77b4" />
4361
+ <use ns4:href="#md7efaf3aec" x="665.299928" y="397.628217" style="fill: #1f77b4; stroke: #1f77b4" />
4362
+ <use ns4:href="#md7efaf3aec" x="695.949056" y="396.861094" style="fill: #1f77b4; stroke: #1f77b4" />
4363
+ <use ns4:href="#md7efaf3aec" x="726.598184" y="397.997746" style="fill: #1f77b4; stroke: #1f77b4" />
4364
+ <use ns4:href="#md7efaf3aec" x="757.247312" y="316.58259" style="fill: #1f77b4; stroke: #1f77b4" />
4365
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
4366
  </g>
4367
  </g>
4368
+ <g id="series--torch-eager" class="series">
4369
+ <path d="M 82.966497 359.229025 L 113.615625 336.991341 L 144.264753 338.68977 L 174.913881 340.649676 L 205.563009 337.183122 L 236.212137 342.30975 L 266.861265 341.346168 L 297.510393 339.896118 L 328.159521 341.271327 L 358.808648 340.438717 L 389.457776 342.132002 L 420.106904 334.919173 L 450.756032 340.405974 L 481.40516 340.528059 L 512.054288 341.112289 L 542.703416 340.373231 L 573.352544 340.218871 L 604.001672 341.364878 L 634.6508 340.069189 L 665.299928 340.574367 L 695.949056 340.438717 L 726.598184 339.068186 L 757.247312 332.696854 L 787.896439 141.132167 " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4370
+ <defs>
4371
+ <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4372
+ </defs>
4373
+ <g clip-path="url(#p088c925177)">
4374
+ <use ns4:href="#m9b8c54d372" x="82.966497" y="359.229025" style="fill: #ff7f0e; stroke: #ff7f0e" />
4375
+ <use ns4:href="#m9b8c54d372" x="113.615625" y="336.991341" style="fill: #ff7f0e; stroke: #ff7f0e" />
4376
+ <use ns4:href="#m9b8c54d372" x="144.264753" y="338.68977" style="fill: #ff7f0e; stroke: #ff7f0e" />
4377
+ <use ns4:href="#m9b8c54d372" x="174.913881" y="340.649676" style="fill: #ff7f0e; stroke: #ff7f0e" />
4378
+ <use ns4:href="#m9b8c54d372" x="205.563009" y="337.183122" style="fill: #ff7f0e; stroke: #ff7f0e" />
4379
+ <use ns4:href="#m9b8c54d372" x="236.212137" y="342.30975" style="fill: #ff7f0e; stroke: #ff7f0e" />
4380
+ <use ns4:href="#m9b8c54d372" x="266.861265" y="341.346168" style="fill: #ff7f0e; stroke: #ff7f0e" />
4381
+ <use ns4:href="#m9b8c54d372" x="297.510393" y="339.896118" style="fill: #ff7f0e; stroke: #ff7f0e" />
4382
+ <use ns4:href="#m9b8c54d372" x="328.159521" y="341.271327" style="fill: #ff7f0e; stroke: #ff7f0e" />
4383
+ <use ns4:href="#m9b8c54d372" x="358.808648" y="340.438717" style="fill: #ff7f0e; stroke: #ff7f0e" />
4384
+ <use ns4:href="#m9b8c54d372" x="389.457776" y="342.132002" style="fill: #ff7f0e; stroke: #ff7f0e" />
4385
+ <use ns4:href="#m9b8c54d372" x="420.106904" y="334.919173" style="fill: #ff7f0e; stroke: #ff7f0e" />
4386
+ <use ns4:href="#m9b8c54d372" x="450.756032" y="340.405974" style="fill: #ff7f0e; stroke: #ff7f0e" />
4387
+ <use ns4:href="#m9b8c54d372" x="481.40516" y="340.528059" style="fill: #ff7f0e; stroke: #ff7f0e" />
4388
+ <use ns4:href="#m9b8c54d372" x="512.054288" y="341.112289" style="fill: #ff7f0e; stroke: #ff7f0e" />
4389
+ <use ns4:href="#m9b8c54d372" x="542.703416" y="340.373231" style="fill: #ff7f0e; stroke: #ff7f0e" />
4390
+ <use ns4:href="#m9b8c54d372" x="573.352544" y="340.218871" style="fill: #ff7f0e; stroke: #ff7f0e" />
4391
+ <use ns4:href="#m9b8c54d372" x="604.001672" y="341.364878" style="fill: #ff7f0e; stroke: #ff7f0e" />
4392
+ <use ns4:href="#m9b8c54d372" x="634.6508" y="340.069189" style="fill: #ff7f0e; stroke: #ff7f0e" />
4393
+ <use ns4:href="#m9b8c54d372" x="665.299928" y="340.574367" style="fill: #ff7f0e; stroke: #ff7f0e" />
4394
+ <use ns4:href="#m9b8c54d372" x="695.949056" y="340.438717" style="fill: #ff7f0e; stroke: #ff7f0e" />
4395
+ <use ns4:href="#m9b8c54d372" x="726.598184" y="339.068186" style="fill: #ff7f0e; stroke: #ff7f0e" />
4396
+ <use ns4:href="#m9b8c54d372" x="757.247312" y="332.696854" style="fill: #ff7f0e; stroke: #ff7f0e" />
4397
+ <use ns4:href="#m9b8c54d372" x="787.896439" y="141.132167" style="fill: #ff7f0e; stroke: #ff7f0e" />
4398
+ </g>
4399
+ </g>
4400
  <g id="patch_3">
4401
  <path d="M 47.72 423.069506 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4402
  </g>
 
4409
  <g id="patch_6">
4410
  <path d="M 47.72 26.88 L 823.142937 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4411
  </g>
4412
+ <g id="text_33">
4413
  <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.431468" y="20.88" transform="rotate(-0 435.431468 20.88)">Attention Implementation Latency</text>
4414
  </g>
4415
  <g id="legend" class="legend">
4416
  <g id="patch_7">
4417
+ <path d="M 54.72 64.7925 L 172.655938 64.7925 Q 174.655938 64.7925 174.655938 62.7925 L 174.655938 33.88 Q 174.655938 31.88 172.655938 31.88 L 54.72 31.88 Q 52.72 31.88 52.72 33.88 L 52.72 62.7925 Q 52.72 64.7925 54.72 64.7925 L 54.72 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4418
  </g>
4419
+ <g id="line2d_33">
4420
  <path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4421
  <g>
4422
  <use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4423
  </g>
4424
  </g>
4425
+ <g id="legend-label--hf-kernels-rotary" class="legend">
4426
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">hf_kernels_rotary</text>
4427
+ </g>
4428
+ <g id="line2d_34">
4429
+ <path d="M 56.72 54.934687 L 66.72 54.934687 L 76.72 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4430
+ <g>
4431
+ <use ns4:href="#m9b8c54d372" x="66.72" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4432
+ </g>
4433
+ </g>
4434
  <g id="legend-label--torch-eager" class="legend">
4435
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="58.434687" transform="rotate(-0 84.72 58.434687)">torch_eager</text>
4436
  </g>
4437
  </g>
4438
  </g>
 
4452
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4453
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4454
  </span> |
4455
+ Cell: combine | 4.43s
4456
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4457
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4458
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4524
  <div class="cell-stdout"><pre class="stdout-text">======================================================================
4525
  LOADING BENCHMARK DATA
4526
  ======================================================================
4527
+ ✓ HF Kernels Rotary : /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/3884170bda871392d403d55c822a8b7de8970f81c4733ae7630938c3bf0db88a
4528
  ✓ PyTorch Rotary : /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/abf801d6445dfa81a8dd7b2e6257930c39c18160a9b97a739858c3b244e16cc5
4529
 
4530
  ✓ Found HF Kernels Rotary
4531
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/3884170bda871392d403d55c822a8b7de8970f81c4733ae7630938c3bf0db88a/rotary.jsonl
4532
  ✓ Found PyTorch Rotary
4533
  Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/abf801d6445dfa81a8dd7b2e6257930c39c18160a9b97a739858c3b244e16cc5/rotary.jsonl
4534
 
 
4539
  COMBINED BENCHMARK SUMMARY
4540
 
4541
  impl wl p50(ms) ok
4542
+ hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 True
4543
+ hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 True
4544
+ hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 True
4545
+ hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.07 True
4546
+ hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.26 True
4547
+ hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 True
4548
+ hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 True
4549
+ hf_kernels_rotary cuda_B1_S2048_H8_D64_R32 0.09 True
4550
+ hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09 True
4551
+ hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09 True
4552
+ hf_kernels_rotary cuda_B1_S512_H8_D128_R64 0.09 True
4553
+ hf_kernels_rotary cuda_B1_S512_H8_D64_R32 0.09 True
4554
+ hf_kernels_rotary cuda_B2_S128_H32_D128_R64 0.09 True
4555
+ hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09 True
4556
+ hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09 True
4557
+ hf_kernels_rotary cuda_B2_S128_H8_D64_R32 0.09 True
4558
+ hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.84 True
4559
+ hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.26 True
4560
+ hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09 True
4561
+ hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.09 True
4562
+ hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09 True
4563
+ hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.09 True
4564
+ hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 True
4565
+ hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True
4566
+ torch_eager cuda_B1_S128_H32_D128_R64 0.21 True
4567
  torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
4568
+ torch_eager cuda_B1_S128_H8_D128_R64 0.22 True
4569
+ torch_eager cuda_B1_S128_H8_D64_R32 0.17 True
4570
+ torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True
4571
+ torch_eager cuda_B1_S2048_H32_D64_R32 0.21 True
4572
+ torch_eager cuda_B1_S2048_H8_D128_R64 0.21 True
4573
+ torch_eager cuda_B1_S2048_H8_D64_R32 0.21 True
4574
+ torch_eager cuda_B1_S512_H32_D128_R64 0.21 True
4575
+ torch_eager cuda_B1_S512_H32_D64_R32 0.21 True
4576
+ torch_eager cuda_B1_S512_H8_D128_R64 0.21 True
4577
  torch_eager cuda_B1_S512_H8_D64_R32 0.22 True
4578
+ torch_eager cuda_B2_S128_H32_D128_R64 0.21 True
4579
+ torch_eager cuda_B2_S128_H32_D64_R32 0.21 True
4580
+ torch_eager cuda_B2_S128_H8_D128_R64 0.21 True
4581
+ torch_eager cuda_B2_S128_H8_D64_R32 0.21 True
4582
  torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True
4583
  torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
4584
  torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True
4585
+ torch_eager cuda_B2_S2048_H8_D64_R32 0.21 True
4586
+ torch_eager cuda_B2_S512_H32_D128_R64 0.21 True
4587
+ torch_eager cuda_B2_S512_H32_D64_R32 0.21 True
4588
  torch_eager cuda_B2_S512_H8_D128_R64 0.21 True
4589
+ torch_eager cuda_B2_S512_H8_D64_R32 0.21 True
4590
 
4591
  GENERATING COMBINED VISUALIZATION
4592
 
 
4606
  <div class="uv-install-logs" id="uv-logs-combine">
4607
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4608
  <div class="uv-logs-content" style="display: none;">
4609
+ Installed 37 packages in 229ms
4610
  </div>
4611
  </div>
4612
  <div class="cell-artifacts">
 
4619
  <rdf:RDF>
4620
  <ns2:Work>
4621
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4622
+ <dc:date>2025-10-29T15:51:00.751980</dc:date>
4623
  <dc:format>image/svg+xml</dc:format>
4624
  <dc:creator>
4625
  <ns2:Agent>
 
4963
  <g id="matplotlib.axis_2">
4964
  <g id="ytick_1">
4965
  <g id="grid-y--2" class="grid grid-y">
4966
+ <path d="M 47.72 392.946895 L 823.142937 392.946895 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4967
  </g>
4968
  <g id="line2d_25">
4969
  <defs>
4970
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4971
  </defs>
4972
  <g>
4973
+ <use ns4:href="#m0fca2865ba" x="47.72" y="392.946895" style="stroke: #000000; stroke-width: 0.8" />
4974
  </g>
4975
  </g>
4976
  <g id="text_25">
4977
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="396.746114" transform="rotate(-0 40.72 396.746114)">0.1</text>
4978
  </g>
4979
  </g>
4980
  <g id="ytick_2">
4981
  <g id="grid-y--3" class="grid grid-y">
4982
+ <path d="M 47.72 346.171092 L 823.142937 346.171092 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4983
  </g>
4984
  <g id="line2d_26">
4985
  <g>
4986
+ <use ns4:href="#m0fca2865ba" x="47.72" y="346.171092" style="stroke: #000000; stroke-width: 0.8" />
4987
  </g>
4988
  </g>
4989
  <g id="text_26">
4990
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="349.970311" transform="rotate(-0 40.72 349.970311)">0.2</text>
4991
  </g>
4992
  </g>
4993
  <g id="ytick_3">
4994
  <g id="grid-y--4" class="grid grid-y">
4995
+ <path d="M 47.72 299.395289 L 823.142937 299.395289 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4996
  </g>
4997
  <g id="line2d_27">
4998
  <g>
4999
+ <use ns4:href="#m0fca2865ba" x="47.72" y="299.395289" style="stroke: #000000; stroke-width: 0.8" />
5000
  </g>
5001
  </g>
5002
  <g id="text_27">
5003
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="303.194508" transform="rotate(-0 40.72 303.194508)">0.3</text>
5004
  </g>
5005
  </g>
5006
  <g id="ytick_4">
5007
  <g id="grid-y--5" class="grid grid-y">
5008
+ <path d="M 47.72 252.619486 L 823.142937 252.619486 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5009
  </g>
5010
  <g id="line2d_28">
5011
  <g>
5012
+ <use ns4:href="#m0fca2865ba" x="47.72" y="252.619486" style="stroke: #000000; stroke-width: 0.8" />
5013
  </g>
5014
  </g>
5015
  <g id="text_28">
5016
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.418705" transform="rotate(-0 40.72 256.418705)">0.4</text>
5017
  </g>
5018
  </g>
5019
  <g id="ytick_5">
5020
  <g id="grid-y--6" class="grid grid-y">
5021
+ <path d="M 47.72 205.843684 L 823.142937 205.843684 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5022
  </g>
5023
  <g id="line2d_29">
5024
  <g>
5025
+ <use ns4:href="#m0fca2865ba" x="47.72" y="205.843684" style="stroke: #000000; stroke-width: 0.8" />
5026
  </g>
5027
  </g>
5028
  <g id="text_29">
5029
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="209.642902" transform="rotate(-0 40.72 209.642902)">0.5</text>
5030
+ </g>
5031
+ </g>
5032
+ <g id="ytick_6">
5033
+ <g id="grid-y--7" class="grid grid-y">
5034
+ <path d="M 47.72 159.067881 L 823.142937 159.067881 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5035
+ </g>
5036
+ <g id="line2d_30">
5037
+ <g>
5038
+ <use ns4:href="#m0fca2865ba" x="47.72" y="159.067881" style="stroke: #000000; stroke-width: 0.8" />
5039
+ </g>
5040
+ </g>
5041
+ <g id="text_30">
5042
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="162.8671" transform="rotate(-0 40.72 162.8671)">0.6</text>
5043
+ </g>
5044
+ </g>
5045
+ <g id="ytick_7">
5046
+ <g id="grid-y--8" class="grid grid-y">
5047
+ <path d="M 47.72 112.292078 L 823.142937 112.292078 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5048
+ </g>
5049
+ <g id="line2d_31">
5050
+ <g>
5051
+ <use ns4:href="#m0fca2865ba" x="47.72" y="112.292078" style="stroke: #000000; stroke-width: 0.8" />
5052
+ </g>
5053
+ </g>
5054
+ <g id="text_31">
5055
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="116.091297" transform="rotate(-0 40.72 116.091297)">0.7</text>
5056
+ </g>
5057
+ </g>
5058
+ <g id="ytick_8">
5059
+ <g id="grid-y--9" class="grid grid-y">
5060
+ <path d="M 47.72 65.516275 L 823.142937 65.516275 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5061
+ </g>
5062
+ <g id="line2d_32">
5063
+ <g>
5064
+ <use ns4:href="#m0fca2865ba" x="47.72" y="65.516275" style="stroke: #000000; stroke-width: 0.8" />
5065
+ </g>
5066
+ </g>
5067
+ <g id="text_32">
5068
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="69.315494" transform="rotate(-0 40.72 69.315494)">0.8</text>
5069
  </g>
5070
  </g>
5071
  <g id="label--y" class="ylabel">
5072
  <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="224.974753" transform="rotate(-90 18.737188 224.974753)">Latency P50 (ms)</text>
5073
  </g>
5074
  </g>
5075
+ <g id="series--hf-kernels-rotary" class="series">
5076
+ <path d="M 82.966497 405.060892 L 113.615625 396.688024 L 144.264753 397.225945 L 174.913881 396.996744 L 205.563009 397.632895 L 236.212137 397.35224 L 266.861265 397.324175 L 297.510393 397.91355 L 328.159521 397.843386 L 358.808648 397.146426 L 389.457776 397.071585 L 420.106904 317.340358 L 450.756032 397.211913 L 481.40516 397.459824 L 512.054288 397.151104 L 542.703416 397.263366 L 573.352544 397.511278 L 604.001672 397.314819 L 634.6508 397.539343 L 665.299928 397.628217 L 695.949056 396.861094 L 726.598184 397.997746 L 757.247312 316.58259 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
5077
  <defs>
5078
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
5079
  </defs>
5080
  <g clip-path="url(#p088c925177)">
5081
  <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
5082
+ <use ns4:href="#md7efaf3aec" x="113.615625" y="396.688024" style="fill: #1f77b4; stroke: #1f77b4" />
5083
+ <use ns4:href="#md7efaf3aec" x="144.264753" y="397.225945" style="fill: #1f77b4; stroke: #1f77b4" />
5084
+ <use ns4:href="#md7efaf3aec" x="174.913881" y="396.996744" style="fill: #1f77b4; stroke: #1f77b4" />
5085
+ <use ns4:href="#md7efaf3aec" x="205.563009" y="397.632895" style="fill: #1f77b4; stroke: #1f77b4" />
5086
+ <use ns4:href="#md7efaf3aec" x="236.212137" y="397.35224" style="fill: #1f77b4; stroke: #1f77b4" />
5087
+ <use ns4:href="#md7efaf3aec" x="266.861265" y="397.324175" style="fill: #1f77b4; stroke: #1f77b4" />
5088
+ <use ns4:href="#md7efaf3aec" x="297.510393" y="397.91355" style="fill: #1f77b4; stroke: #1f77b4" />
5089
+ <use ns4:href="#md7efaf3aec" x="328.159521" y="397.843386" style="fill: #1f77b4; stroke: #1f77b4" />
5090
+ <use ns4:href="#md7efaf3aec" x="358.808648" y="397.146426" style="fill: #1f77b4; stroke: #1f77b4" />
5091
+ <use ns4:href="#md7efaf3aec" x="389.457776" y="397.071585" style="fill: #1f77b4; stroke: #1f77b4" />
5092
+ <use ns4:href="#md7efaf3aec" x="420.106904" y="317.340358" style="fill: #1f77b4; stroke: #1f77b4" />
5093
+ <use ns4:href="#md7efaf3aec" x="450.756032" y="397.211913" style="fill: #1f77b4; stroke: #1f77b4" />
5094
+ <use ns4:href="#md7efaf3aec" x="481.40516" y="397.459824" style="fill: #1f77b4; stroke: #1f77b4" />
5095
+ <use ns4:href="#md7efaf3aec" x="512.054288" y="397.151104" style="fill: #1f77b4; stroke: #1f77b4" />
5096
+ <use ns4:href="#md7efaf3aec" x="542.703416" y="397.263366" style="fill: #1f77b4; stroke: #1f77b4" />
5097
+ <use ns4:href="#md7efaf3aec" x="573.352544" y="397.511278" style="fill: #1f77b4; stroke: #1f77b4" />
5098
+ <use ns4:href="#md7efaf3aec" x="604.001672" y="397.314819" style="fill: #1f77b4; stroke: #1f77b4" />
5099
+ <use ns4:href="#md7efaf3aec" x="634.6508" y="397.539343" style="fill: #1f77b4; stroke: #1f77b4" />
5100
+ <use ns4:href="#md7efaf3aec" x="665.299928" y="397.628217" style="fill: #1f77b4; stroke: #1f77b4" />
5101
+ <use ns4:href="#md7efaf3aec" x="695.949056" y="396.861094" style="fill: #1f77b4; stroke: #1f77b4" />
5102
+ <use ns4:href="#md7efaf3aec" x="726.598184" y="397.997746" style="fill: #1f77b4; stroke: #1f77b4" />
5103
+ <use ns4:href="#md7efaf3aec" x="757.247312" y="316.58259" style="fill: #1f77b4; stroke: #1f77b4" />
5104
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
5105
  </g>
5106
  </g>
5107
+ <g id="series--torch-eager" class="series">
5108
+ <path d="M 82.966497 359.229025 L 113.615625 336.991341 L 144.264753 338.68977 L 174.913881 340.649676 L 205.563009 337.183122 L 236.212137 342.30975 L 266.861265 341.346168 L 297.510393 339.896118 L 328.159521 341.271327 L 358.808648 340.438717 L 389.457776 342.132002 L 420.106904 334.919173 L 450.756032 340.405974 L 481.40516 340.528059 L 512.054288 341.112289 L 542.703416 340.373231 L 573.352544 340.218871 L 604.001672 341.364878 L 634.6508 340.069189 L 665.299928 340.574367 L 695.949056 340.438717 L 726.598184 339.068186 L 757.247312 332.696854 L 787.896439 141.132167 " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5109
+ <defs>
5110
+ <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
5111
+ </defs>
5112
+ <g clip-path="url(#p088c925177)">
5113
+ <use ns4:href="#m9b8c54d372" x="82.966497" y="359.229025" style="fill: #ff7f0e; stroke: #ff7f0e" />
5114
+ <use ns4:href="#m9b8c54d372" x="113.615625" y="336.991341" style="fill: #ff7f0e; stroke: #ff7f0e" />
5115
+ <use ns4:href="#m9b8c54d372" x="144.264753" y="338.68977" style="fill: #ff7f0e; stroke: #ff7f0e" />
5116
+ <use ns4:href="#m9b8c54d372" x="174.913881" y="340.649676" style="fill: #ff7f0e; stroke: #ff7f0e" />
5117
+ <use ns4:href="#m9b8c54d372" x="205.563009" y="337.183122" style="fill: #ff7f0e; stroke: #ff7f0e" />
5118
+ <use ns4:href="#m9b8c54d372" x="236.212137" y="342.30975" style="fill: #ff7f0e; stroke: #ff7f0e" />
5119
+ <use ns4:href="#m9b8c54d372" x="266.861265" y="341.346168" style="fill: #ff7f0e; stroke: #ff7f0e" />
5120
+ <use ns4:href="#m9b8c54d372" x="297.510393" y="339.896118" style="fill: #ff7f0e; stroke: #ff7f0e" />
5121
+ <use ns4:href="#m9b8c54d372" x="328.159521" y="341.271327" style="fill: #ff7f0e; stroke: #ff7f0e" />
5122
+ <use ns4:href="#m9b8c54d372" x="358.808648" y="340.438717" style="fill: #ff7f0e; stroke: #ff7f0e" />
5123
+ <use ns4:href="#m9b8c54d372" x="389.457776" y="342.132002" style="fill: #ff7f0e; stroke: #ff7f0e" />
5124
+ <use ns4:href="#m9b8c54d372" x="420.106904" y="334.919173" style="fill: #ff7f0e; stroke: #ff7f0e" />
5125
+ <use ns4:href="#m9b8c54d372" x="450.756032" y="340.405974" style="fill: #ff7f0e; stroke: #ff7f0e" />
5126
+ <use ns4:href="#m9b8c54d372" x="481.40516" y="340.528059" style="fill: #ff7f0e; stroke: #ff7f0e" />
5127
+ <use ns4:href="#m9b8c54d372" x="512.054288" y="341.112289" style="fill: #ff7f0e; stroke: #ff7f0e" />
5128
+ <use ns4:href="#m9b8c54d372" x="542.703416" y="340.373231" style="fill: #ff7f0e; stroke: #ff7f0e" />
5129
+ <use ns4:href="#m9b8c54d372" x="573.352544" y="340.218871" style="fill: #ff7f0e; stroke: #ff7f0e" />
5130
+ <use ns4:href="#m9b8c54d372" x="604.001672" y="341.364878" style="fill: #ff7f0e; stroke: #ff7f0e" />
5131
+ <use ns4:href="#m9b8c54d372" x="634.6508" y="340.069189" style="fill: #ff7f0e; stroke: #ff7f0e" />
5132
+ <use ns4:href="#m9b8c54d372" x="665.299928" y="340.574367" style="fill: #ff7f0e; stroke: #ff7f0e" />
5133
+ <use ns4:href="#m9b8c54d372" x="695.949056" y="340.438717" style="fill: #ff7f0e; stroke: #ff7f0e" />
5134
+ <use ns4:href="#m9b8c54d372" x="726.598184" y="339.068186" style="fill: #ff7f0e; stroke: #ff7f0e" />
5135
+ <use ns4:href="#m9b8c54d372" x="757.247312" y="332.696854" style="fill: #ff7f0e; stroke: #ff7f0e" />
5136
+ <use ns4:href="#m9b8c54d372" x="787.896439" y="141.132167" style="fill: #ff7f0e; stroke: #ff7f0e" />
5137
+ </g>
5138
+ </g>
5139
  <g id="patch_3">
5140
  <path d="M 47.72 423.069506 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
5141
  </g>
 
5148
  <g id="patch_6">
5149
  <path d="M 47.72 26.88 L 823.142937 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
5150
  </g>
5151
+ <g id="text_33">
5152
  <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.431468" y="20.88" transform="rotate(-0 435.431468 20.88)">Attention Implementation Latency</text>
5153
  </g>
5154
  <g id="legend" class="legend">
5155
  <g id="patch_7">
5156
+ <path d="M 54.72 64.7925 L 172.655938 64.7925 Q 174.655938 64.7925 174.655938 62.7925 L 174.655938 33.88 Q 174.655938 31.88 172.655938 31.88 L 54.72 31.88 Q 52.72 31.88 52.72 33.88 L 52.72 62.7925 Q 52.72 64.7925 54.72 64.7925 L 54.72 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
5157
  </g>
5158
+ <g id="line2d_33">
5159
  <path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
5160
  <g>
5161
  <use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
5162
  </g>
5163
  </g>
5164
+ <g id="legend-label--hf-kernels-rotary" class="legend">
5165
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">hf_kernels_rotary</text>
5166
+ </g>
5167
+ <g id="line2d_34">
5168
+ <path d="M 56.72 54.934687 L 66.72 54.934687 L 76.72 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5169
+ <g>
5170
+ <use ns4:href="#m9b8c54d372" x="66.72" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
5171
+ </g>
5172
+ </g>
5173
  <g id="legend-label--torch-eager" class="legend">
5174
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="58.434687" transform="rotate(-0 84.72 58.434687)">torch_eager</text>
5175
  </g>
5176
  </g>
5177
  </g>