drbh HF Staff commited on
Commit
c415961
·
verified ·
1 Parent(s): e8e4be6

Upload folder using huggingface_hub

Browse files
Files changed (33) hide show
  1. activation/impls/artifacts/benchmark/activation.jsonl +9 -9
  2. activation/impls/hf_kernels_swiglu.html +91 -92
  3. activation/impls/torch_swiglu.html +120 -120
  4. activation/results/artifacts/combine/latency.svg +2 -2
  5. activation/results/combined_results.html +122 -96
  6. causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +24 -24
  7. causal_conv1d/impls/cells/benchmark.py +9 -18
  8. causal_conv1d/impls/hf_kernels_causal_conv1d.html +0 -0
  9. causal_conv1d/impls/torch_causal_conv1d.html +0 -0
  10. causal_conv1d/results/artifacts/combine/latency.svg +2 -2
  11. causal_conv1d/results/combined_results.html +138 -138
  12. flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -6
  13. flash_attn/impls/cells/benchmark.py +9 -10
  14. flash_attn/impls/flash_attention.html +192 -140
  15. flash_attn/impls/hf_kernels_flash_attn.html +94 -99
  16. flash_attn/impls/hf_kernels_flash_attn3.html +80 -80
  17. flash_attn/impls/mem_efficient_attention.html +133 -185
  18. flash_attn/impls/sage_attention.html +17 -12
  19. flash_attn/impls/xformers.html +89 -89
  20. flash_attn/results/artifacts/combine/latency.svg +2 -2
  21. flash_attn/results/combined_results.html +138 -138
  22. index.html +0 -0
  23. layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +4 -4
  24. layer_norm/impls/hf_kernels_layer_norm.html +56 -55
  25. layer_norm/impls/torch_layer_norm.html +55 -54
  26. layer_norm/results/artifacts/combine/latency.svg +2 -2
  27. layer_norm/results/combined_results.html +51 -51
  28. rotary/impls/artifacts/benchmark/rotary.jsonl +24 -24
  29. rotary/impls/hf_kernels_rotary.html +0 -0
  30. rotary/impls/torch_rotary.html +0 -0
  31. rotary/index.html +0 -0
  32. rotary/results/artifacts/combine/latency.svg +1 -1
  33. rotary/results/combined_results.html +84 -84
activation/impls/artifacts/benchmark/activation.jsonl CHANGED
@@ -1,9 +1,9 @@
1
- {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02110099990204617, "p50": 0.022570000055566197, "p90": 0.02266100000269944, "mean": 0.022242599993660406, "iqr": 0.0007410000080199097, "raw_times": [0.022570000055566197, 0.022961000013310695, 0.02191999999467953, 0.02266100000269944, 0.02110099990204617], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02889100005631917, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
2
- {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02585100003216212, "p50": 0.02831100005096232, "p90": 0.02854100000604376, "mean": 0.02791500000967062, "iqr": 0.0013400000398178236, "raw_times": [0.02585100003216212, 0.02854100000604376, 0.02967099999295897, 0.02831100005096232, 0.027200999966225936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031750999937685265, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
3
- {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02804099995046272, "p50": 0.028271000019230996, "p90": 0.02853099999811093, "mean": 0.032097199982672464, "iqr": 0.0004900000476482091, "raw_times": [0.04760199999509496, 0.028271000019230996, 0.02853099999811093, 0.02804099995046272, 0.02804099995046272], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031132000003708526, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
4
- {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02513000004000787, "p50": 0.027131000024382956, "p90": 0.027909999971598154, "mean": 0.027204600019103964, "iqr": 0.0014589999182135216, "raw_times": [0.02513000004000787, 0.027131000024382956, 0.027909999971598154, 0.029401000006146205, 0.026451000053384632], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030690999892613036, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
5
- {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02570000003743189, "p50": 0.026741000056063058, "p90": 0.02731099993980024, "mean": 0.02703079999264446, "iqr": 0.0012099999366910197, "raw_times": [0.02570000003743189, 0.02731099993980024, 0.029300999926817894, 0.02610100000310922, 0.026741000056063058], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030331000061778468, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
6
- {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025050999965969822, "p50": 0.026220999984616356, "p90": 0.028031000056216726, "mean": 0.026778999995258346, "iqr": 0.0018400000953988638, "raw_times": [0.025050999965969822, 0.026190999960817862, 0.026220999984616356, 0.028031000056216726, 0.028401000008670962], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031100999990485434, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
7
- {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02494000000297092, "p50": 0.026971000011144497, "p90": 0.02789099994515709, "mean": 0.027030599972022173, "iqr": 0.0009699999736767495, "raw_times": [0.02494000000297092, 0.026971000011144497, 0.02789099994515709, 0.02842999992935802, 0.02692099997148034], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029161000043131935, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
8
- {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024340999971173005, "p50": 0.02594099998987076, "p90": 0.027440999929240206, "mean": 0.026286999968760938, "iqr": 0.0016499999446750735, "raw_times": [0.024340999971173005, 0.027920999968955584, 0.027440999929240206, 0.02594099998987076, 0.025790999984565133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02797100000861974, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
9
- {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025551000021550863, "p50": 0.026880999939749017, "p90": 0.028271000019230996, "mean": 0.027656800011754967, "iqr": 0.002240999947389355, "raw_times": [0.025551000021550863, 0.026880999939749017, 0.02603000007184164, 0.03155100000640232, 0.028271000019230996], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02960100005111599, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02251099999739381, "p50": 0.02324100000805629, "p90": 0.023539999972399528, "mean": 0.023146399996676337, "iqr": 0.0007499999696847226, "raw_times": [0.023539999972399528, 0.022790000002714805, 0.02324100000805629, 0.02365000000281725, 0.02251099999739381], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029810000000907166, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
2
+ {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027370999987397227, "p50": 0.028240999995432503, "p90": 0.028329999963716546, "mean": 0.02825879998908931, "iqr": 0.00023899997358967084, "raw_times": [0.028090999990126875, 0.028240999995432503, 0.028329999963716546, 0.029261000008773408, 0.027370999987397227], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03212000001440174, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
3
+ {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02641099996480989, "p50": 0.027520999992702855, "p90": 0.028440999983558868, "mean": 0.027734599996165343, "iqr": 0.001440999938040477, "raw_times": [0.02641099996480989, 0.028440999983558868, 0.029299999994236714, 0.027520999992702855, 0.02700000004551839], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.032080999972095015, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
4
+ {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026321000007101247, "p50": 0.02724099999795726, "p90": 0.028659999998126295, "mean": 0.02923079999845868, "iqr": 0.0014990000067882647, "raw_times": [0.026321000007101247, 0.03677099999777056, 0.028659999998126295, 0.02716099999133803, 0.02724099999795726], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031121000006351096, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
5
+ {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025979999975334067, "p50": 0.028520999990178098, "p90": 0.028720999978304462, "mean": 0.027810800008865044, "iqr": 0.00169999992749581, "raw_times": [0.025979999975334067, 0.028811000049699942, 0.028520999990178098, 0.027021000050808652, 0.028720999978304462], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02976100000751103, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
6
+ {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026359999992564553, "p50": 0.027051000017763727, "p90": 0.027101000000584463, "mean": 0.027004599996871548, "iqr": 0.00035100003970001126, "raw_times": [0.027101000000584463, 0.027051000017763727, 0.027761000012560544, 0.026749999960884452, 0.026359999992564553], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029620999953294813, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
7
+ {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02576100001761006, "p50": 0.027530000011211087, "p90": 0.02828099997032041, "mean": 0.0273743999969156, "iqr": 0.001340999972399004, "raw_times": [0.02576100001761006, 0.02828099997032041, 0.026939999997921404, 0.02835999998751504, 0.027530000011211087], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030121000008875853, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
8
+ {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025459999960730784, "p50": 0.028590999988864496, "p90": 0.02870100001928222, "mean": 0.027812799999082927, "iqr": 0.00113999999484804, "raw_times": [0.025459999960730784, 0.02870100001928222, 0.028751000002102955, 0.02756100002443418, 0.028590999988864496], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029950999987704563, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
9
+ {"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02632999996876606, "p50": 0.027500999976837193, "p90": 0.028640000039104052, "mean": 0.028318399995441723, "iqr": 0.0021100000253682083, "raw_times": [0.02632999996876606, 0.03259099997876547, 0.027500999976837193, 0.026530000013735844, 0.028640000039104052], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029991000019435887, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
activation/impls/hf_kernels_swiglu.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: nv | 0.21s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
- <div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:07:54 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
- | N/A 27C P0 80W / 350W | 0MiB / 46068MiB | 1% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.21s
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
- Cell: benchmark | 4.26s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3976,17 +3976,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
3976
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3978
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3979
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.944us 1745.67% 70.944us 70.944us 1
3980
- hf_kernels_swiglu 10.31% 179.916us 99.57% 1.738ms 1.738ms 0.000us 0.00% 5.472us 5.472us 1
3981
- _activation_beeaae6::silu_and_mul 1.09% 18.951us 86.60% 1.512ms 503.911us 4.064us 100.00% 5.472us 1.824us 3
3982
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.064us 100.00% 4.064us 1.355us 3
3983
- Activity Buffer Request 83.12% 1.451ms 83.12% 1.451ms 1.451ms 1.408us 34.65% 1.408us 1.408us 1
3984
- aten::empty 2.66% 46.432us 2.66% 46.432us 15.477us 0.000us 0.00% 0.000us 0.000us 3
3985
- cudaLaunchKernel 2.39% 41.801us 2.39% 41.801us 13.934us 0.000us 0.00% 0.000us 0.000us 3
3986
- cudaDeviceSynchronize 0.43% 7.500us 0.43% 7.500us 7.500us 0.000us 0.00% 0.000us 0.000us 1
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
- Self CPU time total: 1.746ms
3989
- Self CUDA time total: 4.064us
3990
 
3991
 
3992
 
@@ -3996,17 +3996,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
3996
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3997
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.703us 1759.36% 68.703us 68.703us 1
4000
- hf_kernels_swiglu 6.60% 109.215us 99.70% 1.650ms 1.650ms 0.000us 0.00% 5.217us 5.217us 1
4001
- _activation_beeaae6::silu_and_mul 1.44% 23.760us 91.91% 1.521ms 506.927us 3.905us 100.00% 5.217us 1.739us 3
4002
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.905us 100.00% 3.905us 1.302us 3
4003
- Activity Buffer Request 88.83% 1.470ms 88.83% 1.470ms 1.470ms 1.312us 33.60% 1.312us 1.312us 1
4004
- aten::empty 1.19% 19.640us 1.19% 19.640us 6.547us 0.000us 0.00% 0.000us 0.000us 3
4005
- cudaLaunchKernel 1.65% 27.251us 1.65% 27.251us 9.084us 0.000us 0.00% 0.000us 0.000us 3
4006
- cudaDeviceSynchronize 0.30% 4.941us 0.30% 4.941us 4.941us 0.000us 0.00% 0.000us 0.000us 1
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
- Self CPU time total: 1.655ms
4009
- Self CUDA time total: 3.905us
4010
 
4011
 
4012
 
@@ -4016,16 +4016,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.999us 1388.58% 67.999us 67.999us 1
4020
- hf_kernels_swiglu 6.71% 113.524us 99.73% 1.687ms 1.687ms 0.000us 0.00% 6.529us 6.529us 1
4021
- _activation_beeaae6::silu_and_mul 1.26% 21.380us 91.91% 1.555ms 518.231us 4.897us 100.00% 6.529us 2.176us 3
4022
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.897us 100.00% 4.897us 1.632us 3
4023
- Activity Buffer Request 89.08% 1.507ms 89.08% 1.507ms 1.507ms 1.632us 33.33% 1.632us 1.632us 1
4024
- aten::empty 1.11% 18.802us 1.11% 18.802us 6.267us 0.000us 0.00% 0.000us 0.000us 3
4025
- cudaLaunchKernel 1.56% 26.371us 1.56% 26.371us 8.790us 0.000us 0.00% 0.000us 0.000us 3
4026
- cudaDeviceSynchronize 0.27% 4.571us 0.27% 4.571us 4.571us 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
- Self CPU time total: 1.692ms
4029
  Self CUDA time total: 4.897us
4030
 
4031
 
@@ -4036,16 +4036,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.776us 1498.50% 63.776us 63.776us 1
4040
- hf_kernels_swiglu 5.54% 99.283us 99.75% 1.788ms 1.788ms 0.000us 0.00% 5.696us 5.696us 1
4041
- _activation_beeaae6::silu_and_mul 1.20% 21.550us 93.21% 1.671ms 556.862us 4.256us 100.00% 5.696us 1.899us 3
4042
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.256us 100.00% 4.256us 1.419us 3
4043
- Activity Buffer Request 79.15% 1.419ms 79.15% 1.419ms 1.419ms 1.440us 33.83% 1.440us 1.440us 1
4044
- aten::empty 1.00% 17.972us 1.00% 17.972us 5.991us 0.000us 0.00% 0.000us 0.000us 3
4045
- cudaLaunchKernel 12.85% 230.398us 12.85% 230.398us 76.799us 0.000us 0.00% 0.000us 0.000us 3
4046
- cudaDeviceSynchronize 0.25% 4.510us 0.25% 4.510us 4.510us 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
- Self CPU time total: 1.792ms
4049
  Self CUDA time total: 4.256us
4050
 
4051
 
@@ -4056,17 +4056,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.431us 1060.31% 62.431us 62.431us 1
4060
- hf_kernels_swiglu 20.17% 83.914us 98.89% 411.305us 411.305us 0.000us 0.00% 7.872us 7.872us 1
4061
- _activation_beeaae6::silu_and_mul 5.09% 21.171us 74.40% 309.470us 103.157us 5.888us 100.00% 7.872us 2.624us 3
4062
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 100.00% 5.888us 1.963us 3
4063
- Activity Buffer Request 32.60% 135.614us 32.60% 135.614us 135.614us 1.984us 33.70% 1.984us 1.984us 1
4064
- aten::empty 4.31% 17.921us 4.31% 17.921us 5.974us 0.000us 0.00% 0.000us 0.000us 3
4065
- cudaLaunchKernel 36.71% 152.685us 36.71% 152.685us 50.895us 0.000us 0.00% 0.000us 0.000us 3
4066
- cudaDeviceSynchronize 1.11% 4.631us 1.11% 4.631us 4.631us 0.000us 0.00% 0.000us 0.000us 1
4067
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4068
- Self CPU time total: 415.936us
4069
- Self CUDA time total: 5.888us
4070
 
4071
 
4072
 
@@ -4076,16 +4076,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4078
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4079
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.615us 880.40% 67.615us 67.615us 1
4080
- hf_kernels_swiglu 5.97% 103.444us 99.74% 1.727ms 1.727ms 0.000us 0.00% 10.240us 10.240us 1
4081
- _activation_beeaae6::silu_and_mul 1.23% 21.310us 92.70% 1.605ms 535.135us 7.680us 100.00% 10.240us 3.413us 3
4082
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.680us 100.00% 7.680us 2.560us 3
4083
- Activity Buffer Request 82.79% 1.434ms 82.79% 1.434ms 1.434ms 2.560us 33.33% 2.560us 2.560us 1
4084
- aten::empty 1.07% 18.611us 1.07% 18.611us 6.204us 0.000us 0.00% 0.000us 0.000us 3
4085
- cudaLaunchKernel 8.68% 150.305us 8.68% 150.305us 50.102us 0.000us 0.00% 0.000us 0.000us 3
4086
- cudaDeviceSynchronize 0.26% 4.450us 0.26% 4.450us 4.450us 0.000us 0.00% 0.000us 0.000us 1
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
- Self CPU time total: 1.732ms
4089
  Self CUDA time total: 7.680us
4090
 
4091
 
@@ -4096,17 +4096,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
4096
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4097
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4098
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4099
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.423us 962.12% 63.423us 63.423us 1
4100
- hf_kernels_swiglu 5.71% 97.705us 99.74% 1.706ms 1.706ms 0.000us 0.00% 8.800us 8.800us 1
4101
- _activation_beeaae6::silu_and_mul 1.25% 21.440us 92.96% 1.590ms 530.071us 6.592us 100.00% 8.800us 2.933us 3
4102
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 100.00% 6.592us 2.197us 3
4103
- Activity Buffer Request 82.94% 1.419ms 82.94% 1.419ms 1.419ms 2.208us 33.50% 2.208us 2.208us 1
4104
- aten::empty 1.07% 18.230us 1.07% 18.230us 6.077us 0.000us 0.00% 0.000us 0.000us 3
4105
- cudaLaunchKernel 8.77% 149.945us 8.77% 149.945us 49.982us 0.000us 0.00% 0.000us 0.000us 3
4106
- cudaDeviceSynchronize 0.26% 4.450us 0.26% 4.450us 4.450us 0.000us 0.00% 0.000us 0.000us 1
4107
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4108
- Self CPU time total: 1.711ms
4109
- Self CUDA time total: 6.592us
4110
 
4111
 
4112
 
@@ -4116,17 +4116,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
4116
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4117
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 61.982us 658.89% 61.982us 61.982us 1
4120
- hf_kernels_swiglu 22.04% 82.603us 98.77% 370.213us 370.213us 0.000us 0.00% 12.543us 12.543us 1
4121
- _activation_beeaae6::silu_and_mul 5.90% 22.112us 71.72% 268.830us 89.610us 9.407us 100.00% 12.543us 4.181us 3
4122
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.407us 100.00% 9.407us 3.136us 3
4123
- Activity Buffer Request 26.16% 98.063us 26.16% 98.063us 98.063us 3.136us 33.34% 3.136us 3.136us 1
4124
- aten::empty 5.01% 18.780us 5.01% 18.780us 6.260us 0.000us 0.00% 0.000us 0.000us 3
4125
- cudaLaunchKernel 39.66% 148.655us 39.66% 148.655us 49.552us 0.000us 0.00% 0.000us 0.000us 3
4126
- cudaDeviceSynchronize 1.23% 4.600us 1.23% 4.600us 4.600us 0.000us 0.00% 0.000us 0.000us 1
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
- Self CPU time total: 374.813us
4129
- Self CUDA time total: 9.407us
4130
 
4131
 
4132
 
@@ -4136,17 +4136,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
4136
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4137
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4138
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4139
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.776us 490.85% 63.776us 63.776us 1
4140
- hf_kernels_swiglu 24.11% 99.284us 98.97% 407.515us 407.515us 0.000us 0.00% 17.346us 17.346us 1
4141
- _activation_beeaae6::silu_and_mul 5.19% 21.351us 70.31% 289.510us 96.503us 12.993us 100.00% 17.346us 5.782us 3
4142
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 12.993us 100.00% 12.993us 4.331us 3
4143
- Activity Buffer Request 28.96% 119.264us 28.96% 119.264us 119.264us 4.353us 33.50% 4.353us 4.353us 1
4144
- aten::empty 4.55% 18.721us 4.55% 18.721us 6.240us 0.000us 0.00% 0.000us 0.000us 3
4145
- cudaLaunchKernel 36.16% 148.895us 36.16% 148.895us 49.632us 0.000us 0.00% 0.000us 0.000us 3
4146
- cudaDeviceSynchronize 1.03% 4.240us 1.03% 4.240us 4.240us 0.000us 0.00% 0.000us 0.000us 1
4147
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4148
- Self CPU time total: 411.755us
4149
- Self CUDA time total: 12.993us
4150
 
4151
 
4152
  impl wl p50(ms) ok
@@ -4163,13 +4163,12 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
4163
  <div class="uv-install-logs" id="uv-logs-benchmark">
4164
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4165
  <div class="uv-logs-content" style="display: none;">
4166
- Installed 15 packages in 14ms
4167
  </div>
4168
  </div>
4169
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4170
- Fetching 7 files: 14%|█▍ | 1/7 [00:00&lt;00:00, 7.79it/s]
4171
- Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 11.48it/s]
4172
- Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 15.62it/s]</div>
4173
  <div class="cell-artifacts">
4174
  <h4>Artifacts:</h4>
4175
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: nv | 0.26s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
+ <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:26:44 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
 
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
+ | N/A 32C P0 133W / 350W | 0MiB / 46068MiB | 100% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
 
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
+ Cell: benchmark | 4.19s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3976
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3978
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3979
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 72.288us 1807.20% 72.288us 72.288us 1
3980
+ hf_kernels_swiglu 12.07% 211.387us 99.59% 1.744ms 1.744ms 0.000us 0.00% 5.376us 5.376us 1
3981
+ _activation_beeaae6::silu_and_mul 1.10% 19.319us 84.87% 1.486ms 495.368us 4.000us 100.00% 5.376us 1.792us 3
3982
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.000us 100.00% 4.000us 1.333us 3
3983
+ Activity Buffer Request 81.49% 1.427ms 81.49% 1.427ms 1.427ms 1.376us 34.40% 1.376us 1.376us 1
3984
+ aten::empty 2.64% 46.231us 2.64% 46.231us 15.410us 0.000us 0.00% 0.000us 0.000us 3
3985
+ cudaLaunchKernel 2.28% 39.911us 2.28% 39.911us 13.304us 0.000us 0.00% 0.000us 0.000us 3
3986
+ cudaDeviceSynchronize 0.41% 7.220us 0.41% 7.220us 7.220us 0.000us 0.00% 0.000us 0.000us 1
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
+ Self CPU time total: 1.751ms
3989
+ Self CUDA time total: 4.000us
3990
 
3991
 
3992
 
 
3996
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3997
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.686us 1579.79% 62.686us 62.686us 1
4000
+ hf_kernels_swiglu 6.72% 108.943us 99.67% 1.616ms 1.616ms 0.000us 0.00% 5.312us 5.312us 1
4001
+ _activation_beeaae6::silu_and_mul 1.34% 21.721us 91.77% 1.488ms 495.875us 3.968us 100.00% 5.312us 1.771us 3
4002
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3
4003
+ Activity Buffer Request 88.82% 1.440ms 88.82% 1.440ms 1.440ms 1.344us 33.87% 1.344us 1.344us 1
4004
+ aten::empty 1.18% 19.150us 1.18% 19.150us 6.383us 0.000us 0.00% 0.000us 0.000us 3
4005
+ cudaLaunchKernel 1.61% 26.150us 1.61% 26.150us 8.717us 0.000us 0.00% 0.000us 0.000us 3
4006
+ cudaDeviceSynchronize 0.33% 5.310us 0.33% 5.310us 5.310us 0.000us 0.00% 0.000us 0.000us 1
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
+ Self CPU time total: 1.621ms
4009
+ Self CUDA time total: 3.968us
4010
 
4011
 
4012
 
 
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.687us 1361.79% 66.687us 66.687us 1
4020
+ hf_kernels_swiglu 6.74% 109.943us 99.70% 1.626ms 1.626ms 0.000us 0.00% 6.529us 6.529us 1
4021
+ _activation_beeaae6::silu_and_mul 1.25% 20.459us 91.78% 1.496ms 498.816us 4.897us 100.00% 6.529us 2.176us 3
4022
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.897us 100.00% 4.897us 1.632us 3
4023
+ Activity Buffer Request 88.91% 1.450ms 88.91% 1.450ms 1.450ms 1.632us 33.33% 1.632us 1.632us 1
4024
+ aten::empty 1.18% 19.260us 1.18% 19.260us 6.420us 0.000us 0.00% 0.000us 0.000us 3
4025
+ cudaLaunchKernel 1.61% 26.232us 1.61% 26.232us 8.744us 0.000us 0.00% 0.000us 0.000us 3
4026
+ cudaDeviceSynchronize 0.30% 4.870us 0.30% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
+ Self CPU time total: 1.631ms
4029
  Self CUDA time total: 4.897us
4030
 
4031
 
 
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.081us 1552.66% 66.081us 66.081us 1
4040
+ hf_kernels_swiglu 6.15% 108.423us 99.71% 1.758ms 1.758ms 0.000us 0.00% 5.696us 5.696us 1
4041
+ _activation_beeaae6::silu_and_mul 1.25% 22.001us 92.49% 1.631ms 543.697us 4.256us 100.00% 5.696us 1.899us 3
4042
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.256us 100.00% 4.256us 1.419us 3
4043
+ Activity Buffer Request 80.93% 1.427ms 80.93% 1.427ms 1.427ms 1.440us 33.83% 1.440us 1.440us 1
4044
+ aten::empty 1.07% 18.910us 1.07% 18.910us 6.303us 0.000us 0.00% 0.000us 0.000us 3
4045
+ cudaLaunchKernel 10.31% 181.874us 10.31% 181.874us 60.625us 0.000us 0.00% 0.000us 0.000us 3
4046
+ cudaDeviceSynchronize 0.29% 5.110us 0.29% 5.110us 5.110us 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
+ Self CPU time total: 1.764ms
4049
  Self CUDA time total: 4.256us
4050
 
4051
 
 
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.167us 1072.63% 63.167us 63.167us 1
4060
+ hf_kernels_swiglu 15.22% 87.332us 99.19% 569.294us 569.294us 0.000us 0.00% 7.873us 7.873us 1
4061
+ _activation_beeaae6::silu_and_mul 3.58% 20.570us 80.67% 463.002us 154.334us 5.889us 100.00% 7.873us 2.624us 3
4062
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.889us 100.00% 5.889us 1.963us 3
4063
+ Activity Buffer Request 48.76% 279.877us 48.76% 279.877us 279.877us 1.984us 33.69% 1.984us 1.984us 1
4064
+ aten::empty 3.30% 18.960us 3.30% 18.960us 6.320us 0.000us 0.00% 0.000us 0.000us 3
4065
+ cudaLaunchKernel 28.32% 162.555us 28.32% 162.555us 54.185us 0.000us 0.00% 0.000us 0.000us 3
4066
+ cudaDeviceSynchronize 0.81% 4.660us 0.81% 4.660us 4.660us 0.000us 0.00% 0.000us 0.000us 1
4067
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4068
+ Self CPU time total: 573.954us
4069
+ Self CUDA time total: 5.889us
4070
 
4071
 
4072
 
 
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4078
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4079
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.632us 906.67% 69.632us 69.632us 1
4080
+ hf_kernels_swiglu 6.07% 107.484us 99.73% 1.766ms 1.766ms 0.000us 0.00% 10.240us 10.240us 1
4081
+ _activation_beeaae6::silu_and_mul 1.19% 21.010us 92.55% 1.639ms 546.413us 7.680us 100.00% 10.240us 3.413us 3
4082
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.680us 100.00% 7.680us 2.560us 3
4083
+ Activity Buffer Request 81.69% 1.447ms 81.69% 1.447ms 1.447ms 2.560us 33.33% 2.560us 2.560us 1
4084
+ aten::empty 1.11% 19.720us 1.11% 19.720us 6.573us 0.000us 0.00% 0.000us 0.000us 3
4085
+ cudaLaunchKernel 9.67% 171.234us 9.67% 171.234us 57.078us 0.000us 0.00% 0.000us 0.000us 3
4086
+ cudaDeviceSynchronize 0.27% 4.800us 0.27% 4.800us 4.800us 0.000us 0.00% 0.000us 0.000us 1
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
+ Self CPU time total: 1.771ms
4089
  Self CUDA time total: 7.680us
4090
 
4091
 
 
4096
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4097
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4098
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4099
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 72.064us 1098.54% 72.064us 72.064us 1
4100
+ hf_kernels_swiglu 6.19% 109.521us 99.72% 1.763ms 1.763ms 0.000us 0.00% 8.768us 8.768us 1
4101
+ _activation_beeaae6::silu_and_mul 1.22% 21.580us 92.43% 1.635ms 544.850us 6.560us 100.00% 8.768us 2.923us 3
4102
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 100.00% 6.560us 2.187us 3
4103
+ Activity Buffer Request 81.92% 1.449ms 81.92% 1.449ms 1.449ms 2.208us 33.66% 2.208us 2.208us 1
4104
+ aten::empty 1.09% 19.351us 1.09% 19.351us 6.450us 0.000us 0.00% 0.000us 0.000us 3
4105
+ cudaLaunchKernel 9.29% 164.205us 9.29% 164.205us 54.735us 0.000us 0.00% 0.000us 0.000us 3
4106
+ cudaDeviceSynchronize 0.28% 4.990us 0.28% 4.990us 4.990us 0.000us 0.00% 0.000us 0.000us 1
4107
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4108
+ Self CPU time total: 1.768ms
4109
+ Self CUDA time total: 6.560us
4110
 
4111
 
4112
 
 
4116
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4117
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.118us 692.16% 65.118us 65.118us 1
4120
+ hf_kernels_swiglu 16.62% 89.683us 99.03% 534.374us 534.374us 0.000us 0.00% 12.576us 12.576us 1
4121
+ _activation_beeaae6::silu_and_mul 3.96% 21.372us 78.99% 426.201us 142.067us 9.408us 100.00% 12.576us 4.192us 3
4122
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.408us 100.00% 9.408us 3.136us 3
4123
+ Activity Buffer Request 44.61% 240.735us 44.61% 240.735us 240.735us 3.168us 33.67% 3.168us 3.168us 1
4124
+ aten::empty 3.43% 18.490us 3.43% 18.490us 6.163us 0.000us 0.00% 0.000us 0.000us 3
4125
+ cudaLaunchKernel 30.41% 164.094us 30.41% 164.094us 54.698us 0.000us 0.00% 0.000us 0.000us 3
4126
+ cudaDeviceSynchronize 0.97% 5.210us 0.97% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
+ Self CPU time total: 539.584us
4129
+ Self CUDA time total: 9.408us
4130
 
4131
 
4132
 
 
4136
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4137
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4138
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4139
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.182us 527.34% 69.182us 69.182us 1
4140
+ hf_kernels_swiglu 12.86% 103.214us 99.41% 797.800us 797.800us 0.000us 0.00% 17.534us 17.534us 1
4141
+ _activation_beeaae6::silu_and_mul 2.63% 21.139us 84.20% 675.726us 225.242us 13.119us 100.00% 17.534us 5.845us 3
4142
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.119us 100.00% 13.119us 4.373us 3
4143
+ Activity Buffer Request 61.21% 491.232us 61.21% 491.232us 491.232us 4.415us 33.65% 4.415us 4.415us 1
4144
+ aten::empty 2.35% 18.860us 2.35% 18.860us 6.287us 0.000us 0.00% 0.000us 0.000us 3
4145
+ cudaLaunchKernel 20.35% 163.355us 20.35% 163.355us 54.452us 0.000us 0.00% 0.000us 0.000us 3
4146
+ cudaDeviceSynchronize 0.59% 4.750us 0.59% 4.750us 4.750us 0.000us 0.00% 0.000us 0.000us 1
4147
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4148
+ Self CPU time total: 802.550us
4149
+ Self CUDA time total: 13.119us
4150
 
4151
 
4152
  impl wl p50(ms) ok
 
4163
  <div class="uv-install-logs" id="uv-logs-benchmark">
4164
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4165
  <div class="uv-logs-content" style="display: none;">
4166
+ Installed 15 packages in 13ms
4167
  </div>
4168
  </div>
4169
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4170
+ Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 14.29it/s]
4171
+ Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 19.98it/s]</div>
 
4172
  <div class="cell-artifacts">
4173
  <h4>Artifacts:</h4>
4174
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
activation/impls/torch_swiglu.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: nv | 0.21s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
- <div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:07:54 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
- | N/A 27C P0 80W / 350W | 0MiB / 46068MiB | 1% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.21s
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
- Cell: benchmark | 6.88s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3970,20 +3970,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
3970
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3971
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 206.526us 1621.34% 206.526us 206.526us 1
3974
- torch_eager 11.16% 213.167us 99.55% 1.902ms 1.902ms 0.000us 0.00% 15.042us 15.042us 1
3975
- aten::silu 3.29% 62.892us 81.79% 1.563ms 520.961us 6.529us 51.26% 8.833us 2.944us 3
3976
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.529us 51.26% 6.529us 2.176us 3
3977
- aten::mul 2.06% 39.382us 3.23% 61.724us 20.575us 6.209us 48.74% 6.209us 2.070us 3
3978
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.209us 48.74% 6.209us 2.070us 3
3979
- Activity Buffer Request 76.05% 1.453ms 76.05% 1.453ms 1.453ms 2.304us 18.09% 2.304us 2.304us 1
3980
- aten::slice 2.72% 51.931us 3.38% 64.581us 10.764us 0.000us 0.00% 0.000us 0.000us 6
3981
- aten::as_strided 0.66% 12.650us 0.66% 12.650us 2.108us 0.000us 0.00% 0.000us 0.000us 6
3982
- cudaLaunchKernel 3.62% 69.144us 3.62% 69.144us 11.524us 0.000us 0.00% 0.000us 0.000us 6
3983
- cudaDeviceSynchronize 0.45% 8.521us 0.45% 8.521us 8.521us 0.000us 0.00% 0.000us 0.000us 1
3984
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3985
- Self CPU time total: 1.911ms
3986
- Self CUDA time total: 12.738us
3987
 
3988
 
3989
 
@@ -3993,20 +3993,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.965us 1227.21% 151.965us 151.965us 1
3997
- torch_eager 7.02% 119.974us 99.63% 1.704ms 1.704ms 0.000us 0.00% 14.558us 14.558us 1
3998
- aten::silu 2.35% 40.140us 88.12% 1.507ms 502.320us 6.399us 51.68% 8.574us 2.858us 3
3999
  void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.68% 6.399us 2.133us 3
4000
- aten::mul 1.61% 27.481us 2.72% 46.541us 15.514us 5.984us 48.32% 5.984us 1.995us 3
4001
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.32% 5.984us 1.995us 3
4002
- Activity Buffer Request 84.14% 1.439ms 84.14% 1.439ms 1.439ms 2.175us 17.56% 2.175us 2.175us 1
4003
- aten::slice 1.43% 24.471us 1.78% 30.412us 5.069us 0.000us 0.00% 0.000us 0.000us 6
4004
- aten::as_strided 0.35% 5.941us 0.35% 5.941us 0.990us 0.000us 0.00% 0.000us 0.000us 6
4005
- cudaLaunchKernel 2.74% 46.851us 2.74% 46.851us 7.809us 0.000us 0.00% 0.000us 0.000us 6
4006
- cudaDeviceSynchronize 0.37% 6.320us 0.37% 6.320us 6.320us 0.000us 0.00% 0.000us 0.000us 1
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
- Self CPU time total: 1.710ms
4009
- Self CUDA time total: 12.383us
4010
 
4011
 
4012
 
@@ -4016,20 +4016,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.008us 1139.77% 151.008us 151.008us 1
4020
- torch_eager 6.34% 107.173us 99.70% 1.687ms 1.687ms 0.000us 0.00% 15.522us 15.522us 1
4021
- aten::silu 2.38% 40.332us 88.83% 1.503ms 500.911us 6.817us 51.45% 9.090us 3.030us 3
4022
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.817us 51.45% 6.817us 2.272us 3
4023
- aten::mul 1.57% 26.503us 2.73% 46.253us 15.418us 6.432us 48.55% 6.432us 2.144us 3
4024
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 48.55% 6.432us 2.144us 3
4025
- Activity Buffer Request 84.91% 1.436ms 84.91% 1.436ms 1.436ms 2.273us 17.16% 2.273us 2.273us 1
4026
- aten::slice 1.43% 24.250us 1.81% 30.550us 5.092us 0.000us 0.00% 0.000us 0.000us 6
4027
- aten::as_strided 0.37% 6.300us 0.37% 6.300us 1.050us 0.000us 0.00% 0.000us 0.000us 6
4028
- cudaLaunchKernel 2.70% 45.731us 2.70% 45.731us 7.622us 0.000us 0.00% 0.000us 0.000us 6
4029
- cudaDeviceSynchronize 0.30% 5.000us 0.30% 5.000us 5.000us 0.000us 0.00% 0.000us 0.000us 1
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
  Self CPU time total: 1.692ms
4032
- Self CUDA time total: 13.249us
4033
 
4034
 
4035
 
@@ -4039,20 +4039,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4041
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4042
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.149us 1202.68% 153.149us 153.149us 1
4043
- torch_eager 6.34% 109.104us 99.71% 1.717ms 1.717ms 0.000us 0.00% 14.941us 14.941us 1
4044
- aten::silu 2.38% 40.982us 88.93% 1.531ms 510.411us 6.558us 51.50% 8.765us 2.922us 3
4045
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.558us 51.50% 6.558us 2.186us 3
4046
- aten::mul 1.52% 26.241us 2.68% 46.222us 15.407us 6.176us 48.50% 6.176us 2.059us 3
4047
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.50% 6.176us 2.059us 3
4048
- Activity Buffer Request 73.41% 1.264ms 73.41% 1.264ms 1.264ms 2.207us 17.33% 2.207us 2.207us 1
4049
- aten::slice 1.43% 24.560us 1.77% 30.400us 5.067us 0.000us 0.00% 0.000us 0.000us 6
4050
- aten::as_strided 0.34% 5.840us 0.34% 5.840us 0.973us 0.000us 0.00% 0.000us 0.000us 6
4051
- cudaLaunchKernel 14.29% 246.139us 14.29% 246.139us 41.023us 0.000us 0.00% 0.000us 0.000us 6
4052
- cudaDeviceSynchronize 0.29% 4.920us 0.29% 4.920us 4.920us 0.000us 0.00% 0.000us 0.000us 1
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
- Self CPU time total: 1.722ms
4055
- Self CUDA time total: 12.734us
4056
 
4057
 
4058
 
@@ -4062,20 +4062,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
4062
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4063
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4064
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4065
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 149.310us 1126.87% 149.310us 149.310us 1
4066
- torch_eager 5.88% 107.113us 99.73% 1.817ms 1.817ms 0.000us 0.00% 15.555us 15.555us 1
4067
- aten::silu 2.34% 42.602us 89.83% 1.636ms 545.432us 6.785us 51.21% 9.090us 3.030us 3
4068
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.785us 51.21% 6.785us 2.262us 3
4069
- aten::mul 1.33% 24.312us 2.33% 42.512us 14.171us 6.465us 48.79% 6.465us 2.155us 3
4070
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.465us 48.79% 6.465us 2.155us 3
4071
- Activity Buffer Request 78.20% 1.424ms 78.20% 1.424ms 1.424ms 2.305us 17.40% 2.305us 2.305us 1
4072
- aten::slice 1.35% 24.650us 1.68% 30.660us 5.110us 0.000us 0.00% 0.000us 0.000us 6
4073
- aten::as_strided 0.33% 6.010us 0.33% 6.010us 1.002us 0.000us 0.00% 0.000us 0.000us 6
4074
- cudaLaunchKernel 10.29% 187.406us 10.29% 187.406us 31.234us 0.000us 0.00% 0.000us 0.000us 6
4075
- cudaDeviceSynchronize 0.27% 4.950us 0.27% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
- Self CPU time total: 1.822ms
4078
- Self CUDA time total: 13.250us
4079
 
4080
 
4081
 
@@ -4085,20 +4085,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
4085
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4086
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 143.804us 924.73% 143.804us 143.804us 1
4089
- torch_eager 21.50% 103.524us 99.01% 476.736us 476.736us 0.000us 0.00% 18.271us 18.271us 1
4090
- aten::silu 8.70% 41.893us 62.70% 301.891us 100.630us 7.999us 51.44% 10.719us 3.573us 3
4091
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.999us 51.44% 7.999us 2.666us 3
4092
- aten::mul 5.07% 24.390us 8.83% 42.521us 14.174us 7.552us 48.56% 7.552us 2.517us 3
4093
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.56% 7.552us 2.517us 3
4094
- Activity Buffer Request 22.22% 106.973us 22.22% 106.973us 106.973us 2.720us 17.49% 2.720us 2.720us 1
4095
- aten::slice 4.80% 23.090us 5.98% 28.800us 4.800us 0.000us 0.00% 0.000us 0.000us 6
4096
- aten::as_strided 1.19% 5.710us 1.19% 5.710us 0.952us 0.000us 0.00% 0.000us 0.000us 6
4097
- cudaLaunchKernel 35.55% 171.156us 35.55% 171.156us 28.526us 0.000us 0.00% 0.000us 0.000us 6
4098
- cudaDeviceSynchronize 0.99% 4.760us 0.99% 4.760us 4.760us 0.000us 0.00% 0.000us 0.000us 1
4099
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4100
- Self CPU time total: 481.496us
4101
- Self CUDA time total: 15.551us
4102
 
4103
 
4104
 
@@ -4108,20 +4108,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4110
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4111
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.372us 1067.46% 153.372us 153.372us 1
4112
- torch_eager 5.96% 108.164us 99.73% 1.810ms 1.810ms 0.000us 0.00% 16.832us 16.832us 1
4113
- aten::silu 2.30% 41.731us 89.59% 1.626ms 541.925us 7.360us 51.22% 9.824us 3.275us 3
4114
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 51.22% 7.360us 2.453us 3
4115
- aten::mul 1.41% 25.542us 2.47% 44.792us 14.931us 7.008us 48.78% 7.008us 2.336us 3
4116
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.78% 7.008us 2.336us 3
4117
- Activity Buffer Request 78.82% 1.430ms 78.82% 1.430ms 1.430ms 2.464us 17.15% 2.464us 2.464us 1
4118
- aten::slice 1.37% 24.840us 1.70% 30.900us 5.150us 0.000us 0.00% 0.000us 0.000us 6
4119
- aten::as_strided 0.33% 6.060us 0.33% 6.060us 1.010us 0.000us 0.00% 0.000us 0.000us 6
4120
- cudaLaunchKernel 9.53% 172.976us 9.53% 172.976us 28.829us 0.000us 0.00% 0.000us 0.000us 6
4121
- cudaDeviceSynchronize 0.27% 4.960us 0.27% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1
4122
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4123
- Self CPU time total: 1.815ms
4124
- Self CUDA time total: 14.368us
4125
 
4126
 
4127
 
@@ -4131,20 +4131,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
4131
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4132
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4133
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4134
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 146.240us 942.27% 146.240us 146.240us 1
4135
- torch_eager 22.59% 104.486us 98.96% 457.726us 457.726us 0.000us 0.00% 18.208us 18.208us 1
4136
- aten::silu 8.78% 40.590us 60.43% 279.519us 93.173us 7.936us 51.13% 10.624us 3.541us 3
4137
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.13% 7.936us 2.645us 3
4138
- aten::mul 5.53% 25.579us 9.45% 43.730us 14.577us 7.584us 48.87% 7.584us 2.528us 3
4139
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.87% 7.584us 2.528us 3
4140
- Activity Buffer Request 18.85% 87.193us 18.85% 87.193us 87.193us 2.688us 17.32% 2.688us 2.688us 1
4141
- aten::slice 5.23% 24.201us 6.48% 29.991us 4.999us 0.000us 0.00% 0.000us 0.000us 6
4142
- aten::as_strided 1.25% 5.790us 1.25% 5.790us 0.965us 0.000us 0.00% 0.000us 0.000us 6
4143
- cudaLaunchKernel 36.73% 169.887us 36.73% 169.887us 28.314us 0.000us 0.00% 0.000us 0.000us 6
4144
- cudaDeviceSynchronize 1.04% 4.800us 1.04% 4.800us 4.800us 0.000us 0.00% 0.000us 0.000us 1
4145
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4146
- Self CPU time total: 462.526us
4147
- Self CUDA time total: 15.520us
4148
 
4149
 
4150
 
@@ -4154,20 +4154,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
4154
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4155
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4156
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4157
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 181.470us 803.28% 181.470us 181.470us 1
4158
- torch_eager 5.97% 109.125us 99.74% 1.823ms 1.823ms 0.000us 0.00% 26.526us 26.526us 1
4159
- aten::silu 2.38% 43.492us 88.50% 1.617ms 539.072us 11.647us 51.56% 15.582us 5.194us 3
4160
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.647us 51.56% 11.647us 3.882us 3
4161
- aten::mul 1.42% 25.882us 3.51% 64.123us 21.374us 10.944us 48.44% 10.944us 3.648us 3
4162
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.944us 48.44% 10.944us 3.648us 3
4163
- Activity Buffer Request 77.67% 1.419ms 77.67% 1.419ms 1.419ms 3.935us 17.42% 3.935us 3.935us 1
4164
- aten::slice 1.42% 25.910us 1.76% 32.089us 5.348us 0.000us 0.00% 0.000us 0.000us 6
4165
- aten::as_strided 0.34% 6.179us 0.34% 6.179us 1.030us 0.000us 0.00% 0.000us 0.000us 6
4166
- cudaLaunchKernel 10.54% 192.606us 10.54% 192.606us 32.101us 0.000us 0.00% 0.000us 0.000us 6
4167
- cudaDeviceSynchronize 0.26% 4.790us 0.26% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1
4168
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4169
- Self CPU time total: 1.827ms
4170
- Self CUDA time total: 22.591us
4171
 
4172
 
4173
  impl wl p50(ms) ok
@@ -4184,7 +4184,7 @@ torch_eager cuda_T512_D768 0.05 True
4184
  <div class="uv-install-logs" id="uv-logs-benchmark">
4185
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4186
  <div class="uv-logs-content" style="display: none;">
4187
- Installed 37 packages in 192ms
4188
  </div>
4189
  </div>
4190
  <div class="cell-artifacts">
 
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: nv | 0.26s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
+ <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:26:44 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
 
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
+ | N/A 32C P0 133W / 350W | 0MiB / 46068MiB | 100% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
 
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
+ Cell: benchmark | 6.86s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3970
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3971
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 189.470us 1483.94% 189.470us 189.470us 1
3974
+ torch_eager 11.64% 220.727us 99.60% 1.889ms 1.889ms 0.000us 0.00% 15.103us 15.103us 1
3975
+ aten::silu 3.36% 63.732us 81.84% 1.552ms 517.326us 6.559us 51.37% 8.894us 2.965us 3
3976
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.559us 51.37% 6.559us 2.186us 3
3977
+ aten::mul 1.83% 34.608us 3.05% 57.780us 19.260us 6.209us 48.63% 6.209us 2.070us 3
3978
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.209us 48.63% 6.209us 2.070us 3
3979
+ Activity Buffer Request 76.17% 1.444ms 76.17% 1.444ms 1.444ms 2.335us 18.29% 2.335us 2.335us 1
3980
+ aten::slice 2.47% 46.790us 3.07% 58.281us 9.714us 0.000us 0.00% 0.000us 0.000us 6
3981
+ aten::as_strided 0.61% 11.491us 0.61% 11.491us 1.915us 0.000us 0.00% 0.000us 0.000us 6
3982
+ cudaLaunchKernel 3.54% 67.043us 3.54% 67.043us 11.174us 0.000us 0.00% 0.000us 0.000us 6
3983
+ cudaDeviceSynchronize 0.40% 7.531us 0.40% 7.531us 7.531us 0.000us 0.00% 0.000us 0.000us 1
3984
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3985
+ Self CPU time total: 1.896ms
3986
+ Self CUDA time total: 12.768us
3987
 
3988
 
3989
 
 
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 160.895us 1299.43% 160.895us 160.895us 1
3997
+ torch_eager 6.82% 117.243us 99.71% 1.713ms 1.713ms 0.000us 0.00% 14.558us 14.558us 1
3998
+ aten::silu 2.46% 42.340us 88.23% 1.516ms 505.362us 6.399us 51.68% 8.575us 2.858us 3
3999
  void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.68% 6.399us 2.133us 3
4000
+ aten::mul 1.64% 28.101us 2.83% 48.681us 16.227us 5.983us 48.32% 5.983us 1.994us 3
4001
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.983us 48.32% 5.983us 1.994us 3
4002
+ Activity Buffer Request 84.10% 1.445ms 84.10% 1.445ms 1.445ms 2.176us 17.57% 2.176us 2.176us 1
4003
+ aten::slice 1.47% 25.252us 1.82% 31.222us 5.204us 0.000us 0.00% 0.000us 0.000us 6
4004
+ aten::as_strided 0.35% 5.970us 0.35% 5.970us 0.995us 0.000us 0.00% 0.000us 0.000us 6
4005
+ cudaLaunchKernel 2.87% 49.290us 2.87% 49.290us 8.215us 0.000us 0.00% 0.000us 0.000us 6
4006
+ cudaDeviceSynchronize 0.29% 5.020us 0.29% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
+ Self CPU time total: 1.718ms
4009
+ Self CUDA time total: 12.382us
4010
 
4011
 
4012
 
 
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 157.982us 1195.38% 157.982us 157.982us 1
4020
+ torch_eager 6.51% 110.244us 99.65% 1.686ms 1.686ms 0.000us 0.00% 15.488us 15.488us 1
4021
+ aten::silu 2.52% 42.653us 88.50% 1.498ms 499.192us 6.784us 51.33% 9.056us 3.019us 3
4022
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.33% 6.784us 2.261us 3
4023
+ aten::mul 1.66% 28.021us 2.76% 46.791us 15.597us 6.432us 48.67% 6.432us 2.144us 3
4024
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 48.67% 6.432us 2.144us 3
4025
+ Activity Buffer Request 84.30% 1.427ms 84.30% 1.427ms 1.427ms 2.272us 17.19% 2.272us 2.272us 1
4026
+ aten::slice 1.51% 25.627us 1.87% 31.700us 5.283us 0.000us 0.00% 0.000us 0.000us 6
4027
+ aten::as_strided 0.36% 6.073us 0.36% 6.073us 1.012us 0.000us 0.00% 0.000us 0.000us 6
4028
+ cudaLaunchKernel 2.78% 47.050us 2.78% 47.050us 7.842us 0.000us 0.00% 0.000us 0.000us 6
4029
+ cudaDeviceSynchronize 0.35% 5.950us 0.35% 5.950us 5.950us 0.000us 0.00% 0.000us 0.000us 1
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
  Self CPU time total: 1.692ms
4032
+ Self CUDA time total: 13.216us
4033
 
4034
 
4035
 
 
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4041
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4042
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.902us 1258.67% 159.902us 159.902us 1
4043
+ torch_eager 6.73% 114.317us 99.66% 1.694ms 1.694ms 0.000us 0.00% 14.912us 14.912us 1
4044
+ aten::silu 2.46% 41.881us 88.34% 1.501ms 500.465us 6.560us 51.64% 8.768us 2.923us 3
4045
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.64% 6.560us 2.187us 3
4046
+ aten::mul 1.68% 28.581us 2.79% 47.441us 15.814us 6.144us 48.36% 6.144us 2.048us 3
4047
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.144us 48.36% 6.144us 2.048us 3
4048
+ Activity Buffer Request 74.33% 1.263ms 74.33% 1.263ms 1.263ms 2.208us 17.38% 2.208us 2.208us 1
4049
+ aten::slice 1.44% 24.468us 1.80% 30.638us 5.106us 0.000us 0.00% 0.000us 0.000us 6
4050
+ aten::as_strided 0.36% 6.170us 0.36% 6.170us 1.028us 0.000us 0.00% 0.000us 0.000us 6
4051
+ cudaLaunchKernel 12.65% 214.994us 12.65% 214.994us 35.832us 0.000us 0.00% 0.000us 0.000us 6
4052
+ cudaDeviceSynchronize 0.34% 5.830us 0.34% 5.830us 5.830us 0.000us 0.00% 0.000us 0.000us 1
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
+ Self CPU time total: 1.700ms
4055
+ Self CUDA time total: 12.704us
4056
 
4057
 
4058
 
 
4062
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4063
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4064
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4065
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 157.053us 1185.48% 157.053us 157.053us 1
4066
+ torch_eager 6.08% 111.294us 99.69% 1.824ms 1.824ms 0.000us 0.00% 15.552us 15.552us 1
4067
+ aten::silu 2.39% 43.729us 89.42% 1.636ms 545.306us 6.784us 51.21% 9.088us 3.029us 3
4068
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.21% 6.784us 2.261us 3
4069
+ aten::mul 1.44% 26.361us 2.52% 46.181us 15.394us 6.464us 48.79% 6.464us 2.155us 3
4070
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.79% 6.464us 2.155us 3
4071
+ Activity Buffer Request 77.97% 1.426ms 77.97% 1.426ms 1.426ms 2.304us 17.39% 2.304us 2.304us 1
4072
+ aten::slice 1.34% 24.571us 1.66% 30.441us 5.074us 0.000us 0.00% 0.000us 0.000us 6
4073
+ aten::as_strided 0.32% 5.870us 0.32% 5.870us 0.978us 0.000us 0.00% 0.000us 0.000us 6
4074
+ cudaLaunchKernel 10.14% 185.544us 10.14% 185.544us 30.924us 0.000us 0.00% 0.000us 0.000us 6
4075
+ cudaDeviceSynchronize 0.31% 5.601us 0.31% 5.601us 5.601us 0.000us 0.00% 0.000us 0.000us 1
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
+ Self CPU time total: 1.829ms
4078
+ Self CUDA time total: 13.248us
4079
 
4080
 
4081
 
 
4085
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4086
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.390us 977.47% 151.390us 151.390us 1
4089
+ torch_eager 22.03% 109.975us 99.02% 494.363us 494.363us 0.000us 0.00% 18.176us 18.176us 1
4090
+ aten::silu 8.41% 41.971us 61.88% 308.937us 102.979us 7.936us 51.24% 10.624us 3.541us 3
4091
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.24% 7.936us 2.645us 3
4092
+ aten::mul 5.23% 26.101us 8.92% 44.531us 14.844us 7.552us 48.76% 7.552us 2.517us 3
4093
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.76% 7.552us 2.517us 3
4094
+ Activity Buffer Request 22.19% 110.773us 22.19% 110.773us 110.773us 2.688us 17.36% 2.688us 2.688us 1
4095
+ aten::slice 5.05% 25.220us 6.19% 30.920us 5.153us 0.000us 0.00% 0.000us 0.000us 6
4096
+ aten::as_strided 1.14% 5.700us 1.14% 5.700us 0.950us 0.000us 0.00% 0.000us 0.000us 6
4097
+ cudaLaunchKernel 34.98% 174.623us 34.98% 174.623us 29.104us 0.000us 0.00% 0.000us 0.000us 6
4098
+ cudaDeviceSynchronize 0.98% 4.900us 0.98% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1
4099
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4100
+ Self CPU time total: 499.263us
4101
+ Self CUDA time total: 15.488us
4102
 
4103
 
4104
 
 
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4110
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4111
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 163.583us 1143.70% 163.583us 163.583us 1
4112
+ torch_eager 6.28% 116.052us 99.70% 1.841ms 1.841ms 0.000us 0.00% 16.767us 16.767us 1
4113
+ aten::silu 2.27% 41.942us 89.09% 1.645ms 548.450us 7.327us 51.23% 9.791us 3.264us 3
4114
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.327us 51.23% 7.327us 2.442us 3
4115
+ aten::mul 1.55% 28.681us 2.62% 48.392us 16.131us 6.976us 48.77% 6.976us 2.325us 3
4116
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.976us 48.77% 6.976us 2.325us 3
4117
+ Activity Buffer Request 78.22% 1.445ms 78.22% 1.445ms 1.445ms 2.464us 17.23% 2.464us 2.464us 1
4118
+ aten::slice 1.38% 25.430us 1.70% 31.392us 5.232us 0.000us 0.00% 0.000us 0.000us 6
4119
+ aten::as_strided 0.32% 5.962us 0.32% 5.962us 0.994us 0.000us 0.00% 0.000us 0.000us 6
4120
+ cudaLaunchKernel 9.67% 178.614us 9.67% 178.614us 29.769us 0.000us 0.00% 0.000us 0.000us 6
4121
+ cudaDeviceSynchronize 0.30% 5.570us 0.30% 5.570us 5.570us 0.000us 0.00% 0.000us 0.000us 1
4122
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4123
+ Self CPU time total: 1.847ms
4124
+ Self CUDA time total: 14.303us
4125
 
4126
 
4127
 
 
4131
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4132
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4133
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4134
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 150.172us 969.60% 150.172us 150.172us 1
4135
+ torch_eager 23.07% 110.204us 98.98% 472.752us 472.752us 0.000us 0.00% 18.176us 18.176us 1
4136
+ aten::silu 9.08% 43.371us 60.20% 287.547us 95.849us 7.936us 51.24% 10.624us 3.541us 3
4137
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.24% 7.936us 2.645us 3
4138
+ aten::mul 5.48% 26.181us 9.38% 44.801us 14.934us 7.552us 48.76% 7.552us 2.517us 3
4139
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.76% 7.552us 2.517us 3
4140
+ Activity Buffer Request 19.26% 92.002us 19.26% 92.002us 92.002us 2.688us 17.36% 2.688us 2.688us 1
4141
+ aten::slice 5.00% 23.870us 6.32% 30.200us 5.033us 0.000us 0.00% 0.000us 0.000us 6
4142
+ aten::as_strided 1.33% 6.330us 1.33% 6.330us 1.055us 0.000us 0.00% 0.000us 0.000us 6
4143
+ cudaLaunchKernel 35.76% 170.794us 35.76% 170.794us 28.466us 0.000us 0.00% 0.000us 0.000us 6
4144
+ cudaDeviceSynchronize 1.02% 4.871us 1.02% 4.871us 4.871us 0.000us 0.00% 0.000us 0.000us 1
4145
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4146
+ Self CPU time total: 477.623us
4147
+ Self CUDA time total: 15.488us
4148
 
4149
 
4150
 
 
4154
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4155
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4156
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4157
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 160.000us 713.30% 160.000us 160.000us 1
4158
+ torch_eager 5.99% 109.975us 99.73% 1.831ms 1.831ms 0.000us 0.00% 26.335us 26.335us 1
4159
+ aten::silu 2.30% 42.230us 89.52% 1.643ms 547.763us 11.583us 51.64% 15.487us 5.162us 3
4160
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.583us 51.64% 11.583us 3.861us 3
4161
+ aten::mul 1.54% 28.250us 2.52% 46.180us 15.393us 10.848us 48.36% 10.848us 3.616us 3
4162
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.848us 48.36% 10.848us 3.616us 3
4163
+ Activity Buffer Request 78.83% 1.447ms 78.83% 1.447ms 1.447ms 3.904us 17.40% 3.904us 3.904us 1
4164
+ aten::slice 1.37% 25.211us 1.70% 31.261us 5.210us 0.000us 0.00% 0.000us 0.000us 6
4165
+ aten::as_strided 0.33% 6.050us 0.33% 6.050us 1.008us 0.000us 0.00% 0.000us 0.000us 6
4166
+ cudaLaunchKernel 9.37% 171.964us 9.37% 171.964us 28.661us 0.000us 0.00% 0.000us 0.000us 6
4167
+ cudaDeviceSynchronize 0.27% 4.930us 0.27% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
4168
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4169
+ Self CPU time total: 1.836ms
4170
+ Self CUDA time total: 22.431us
4171
 
4172
 
4173
  impl wl p50(ms) ok
 
4184
  <div class="uv-install-logs" id="uv-logs-benchmark">
4185
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4186
  <div class="uv-logs-content" style="display: none;">
4187
+ Installed 37 packages in 230ms
4188
  </div>
4189
  </div>
4190
  <div class="cell-artifacts">
activation/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 431dea6a591fc822f7d0d0d6f793e8c11170edb647c627b5a44ad9883df2c3fc
  • Pointer size: 130 Bytes
  • Size of remote file: 20.7 kB

Git LFS Details

  • SHA256: f62c7d85fc4a76cf7a1060a62df99ff0d32133ab94bb502b68dcd53171c39602
  • Pointer size: 130 Bytes
  • Size of remote file: 21.4 kB
activation/results/combined_results.html CHANGED
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
- <dc:date>2025-10-28T14:09:13.211569</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
@@ -4021,83 +4021,96 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4021
  <g id="matplotlib.axis_2">
4022
  <g id="ytick_1">
4023
  <g id="grid-y--2" class="grid grid-y">
4024
- <path d="M 60.23 416.825206 L 847.294169 416.825206 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4025
  </g>
4026
  <g id="line2d_10">
4027
  <defs>
4028
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4029
  </defs>
4030
  <g>
4031
- <use ns4:href="#m0fca2865ba" x="60.23" y="416.825206" style="stroke: #000000; stroke-width: 0.8" />
4032
  </g>
4033
  </g>
4034
  <g id="text_10">
4035
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="420.624425" transform="rotate(-0 53.23 420.624425)">0.025</text>
4036
  </g>
4037
  </g>
4038
  <g id="ytick_2">
4039
  <g id="grid-y--3" class="grid grid-y">
4040
- <path d="M 60.23 346.161452 L 847.294169 346.161452 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4041
  </g>
4042
  <g id="line2d_11">
4043
  <g>
4044
- <use ns4:href="#m0fca2865ba" x="60.23" y="346.161452" style="stroke: #000000; stroke-width: 0.8" />
4045
  </g>
4046
  </g>
4047
  <g id="text_11">
4048
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="349.960671" transform="rotate(-0 53.23 349.960671)">0.030</text>
4049
  </g>
4050
  </g>
4051
  <g id="ytick_3">
4052
  <g id="grid-y--4" class="grid grid-y">
4053
- <path d="M 60.23 275.497698 L 847.294169 275.497698 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4054
  </g>
4055
  <g id="line2d_12">
4056
  <g>
4057
- <use ns4:href="#m0fca2865ba" x="60.23" y="275.497698" style="stroke: #000000; stroke-width: 0.8" />
4058
  </g>
4059
  </g>
4060
  <g id="text_12">
4061
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="279.296917" transform="rotate(-0 53.23 279.296917)">0.035</text>
4062
  </g>
4063
  </g>
4064
  <g id="ytick_4">
4065
  <g id="grid-y--5" class="grid grid-y">
4066
- <path d="M 60.23 204.833944 L 847.294169 204.833944 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4067
  </g>
4068
  <g id="line2d_13">
4069
  <g>
4070
- <use ns4:href="#m0fca2865ba" x="60.23" y="204.833944" style="stroke: #000000; stroke-width: 0.8" />
4071
  </g>
4072
  </g>
4073
  <g id="text_13">
4074
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="208.633163" transform="rotate(-0 53.23 208.633163)">0.040</text>
4075
  </g>
4076
  </g>
4077
  <g id="ytick_5">
4078
  <g id="grid-y--6" class="grid grid-y">
4079
- <path d="M 60.23 134.170191 L 847.294169 134.170191 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4080
  </g>
4081
  <g id="line2d_14">
4082
  <g>
4083
- <use ns4:href="#m0fca2865ba" x="60.23" y="134.170191" style="stroke: #000000; stroke-width: 0.8" />
4084
  </g>
4085
  </g>
4086
  <g id="text_14">
4087
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="137.969409" transform="rotate(-0 53.23 137.969409)">0.045</text>
4088
  </g>
4089
  </g>
4090
  <g id="ytick_6">
4091
  <g id="grid-y--7" class="grid grid-y">
4092
- <path d="M 60.23 63.506437 L 847.294169 63.506437 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4093
  </g>
4094
  <g id="line2d_15">
4095
  <g>
4096
- <use ns4:href="#m0fca2865ba" x="60.23" y="63.506437" style="stroke: #000000; stroke-width: 0.8" />
4097
  </g>
4098
  </g>
4099
  <g id="text_15">
4100
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="67.305655" transform="rotate(-0 53.23 67.305655)">0.050</text>
 
 
 
 
 
 
 
 
 
 
 
 
 
4101
  </g>
4102
  </g>
4103
  <g id="label--y" class="ylabel">
@@ -4105,37 +4118,37 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4105
  </g>
4106
  </g>
4107
  <g id="series--hf-kernels-swiglu" class="series">
4108
- <path d="M 96.005644 451.16779 L 185.444754 370.031668 L 274.883864 370.596978 L 364.322974 386.708314 L 453.762084 392.220086 L 543.201194 399.569118 L 632.640304 388.969554 L 722.079415 403.526288 L 811.518525 390.241503 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4109
  <defs>
4110
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4111
  </defs>
4112
  <g clip-path="url(#p620c7d392f)">
4113
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4114
- <use ns4:href="#md7efaf3aec" x="185.444754" y="370.031668" style="fill: #1f77b4; stroke: #1f77b4" />
4115
- <use ns4:href="#md7efaf3aec" x="274.883864" y="370.596978" style="fill: #1f77b4; stroke: #1f77b4" />
4116
- <use ns4:href="#md7efaf3aec" x="364.322974" y="386.708314" style="fill: #1f77b4; stroke: #1f77b4" />
4117
- <use ns4:href="#md7efaf3aec" x="453.762084" y="392.220086" style="fill: #1f77b4; stroke: #1f77b4" />
4118
- <use ns4:href="#md7efaf3aec" x="543.201194" y="399.569118" style="fill: #1f77b4; stroke: #1f77b4" />
4119
- <use ns4:href="#md7efaf3aec" x="632.640304" y="388.969554" style="fill: #1f77b4; stroke: #1f77b4" />
4120
- <use ns4:href="#md7efaf3aec" x="722.079415" y="403.526288" style="fill: #1f77b4; stroke: #1f77b4" />
4121
- <use ns4:href="#md7efaf3aec" x="811.518525" y="390.241503" style="fill: #1f77b4; stroke: #1f77b4" />
4122
  </g>
4123
  </g>
4124
  <g id="series--torch-eager" class="series">
4125
- <path d="M 96.005644 166.37873 L 185.444754 47.08418 L 274.883864 54.857193 L 364.322974 60.807081 L 453.762084 69.569387 L 543.201194 78.176231 L 632.640304 66.44605 L 722.079415 63.902153 L 811.518525 71.109857 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4126
  <defs>
4127
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4128
  </defs>
4129
  <g clip-path="url(#p620c7d392f)">
4130
- <use ns4:href="#m9b8c54d372" x="96.005644" y="166.37873" style="fill: #ff7f0e; stroke: #ff7f0e" />
4131
  <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4132
- <use ns4:href="#m9b8c54d372" x="274.883864" y="54.857193" style="fill: #ff7f0e; stroke: #ff7f0e" />
4133
- <use ns4:href="#m9b8c54d372" x="364.322974" y="60.807081" style="fill: #ff7f0e; stroke: #ff7f0e" />
4134
- <use ns4:href="#m9b8c54d372" x="453.762084" y="69.569387" style="fill: #ff7f0e; stroke: #ff7f0e" />
4135
- <use ns4:href="#m9b8c54d372" x="543.201194" y="78.176231" style="fill: #ff7f0e; stroke: #ff7f0e" />
4136
- <use ns4:href="#m9b8c54d372" x="632.640304" y="66.44605" style="fill: #ff7f0e; stroke: #ff7f0e" />
4137
- <use ns4:href="#m9b8c54d372" x="722.079415" y="63.902153" style="fill: #ff7f0e; stroke: #ff7f0e" />
4138
- <use ns4:href="#m9b8c54d372" x="811.518525" y="71.109857" style="fill: #ff7f0e; stroke: #ff7f0e" />
4139
  </g>
4140
  </g>
4141
  <g id="patch_3">
@@ -4150,30 +4163,30 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4150
  <g id="patch_6">
4151
  <path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4152
  </g>
4153
- <g id="text_16">
4154
  <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
4155
  </g>
4156
  <g id="legend" class="legend">
4157
  <g id="patch_7">
4158
- <path d="M 720.811356 466.37197 L 840.294169 466.37197 Q 842.294169 466.37197 842.294169 464.37197 L 842.294169 435.45947 Q 842.294169 433.45947 840.294169 433.45947 L 720.811356 433.45947 Q 718.811356 433.45947 718.811356 435.45947 L 718.811356 464.37197 Q 718.811356 466.37197 720.811356 466.37197 L 720.811356 466.37197 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4159
  </g>
4160
- <g id="line2d_16">
4161
- <path d="M 722.811356 441.557908 L 732.811356 441.557908 L 742.811356 441.557908 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4162
  <g>
4163
- <use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
4164
  </g>
4165
  </g>
4166
  <g id="legend-label--hf-kernels-swiglu" class="legend">
4167
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
4168
  </g>
4169
- <g id="line2d_17">
4170
- <path d="M 722.811356 456.514158 L 732.811356 456.514158 L 742.811356 456.514158 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4171
  <g>
4172
- <use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
4173
  </g>
4174
  </g>
4175
  <g id="legend-label--torch-eager" class="legend">
4176
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="460.014158" transform="rotate(-0 750.811356 460.014158)">torch_eager</text>
4177
  </g>
4178
  </g>
4179
  </g>
@@ -4193,7 +4206,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4193
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4194
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4195
  </span> |
4196
- Cell: combine | 4.28s
4197
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4198
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4199
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4319,7 +4332,7 @@ Implementations included:
4319
  <div class="uv-install-logs" id="uv-logs-combine">
4320
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4321
  <div class="uv-logs-content" style="display: none;">
4322
- Installed 37 packages in 195ms
4323
  </div>
4324
  </div>
4325
  <div class="cell-artifacts">
@@ -4332,7 +4345,7 @@ Installed 37 packages in 195ms
4332
  <rdf:RDF>
4333
  <ns2:Work>
4334
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4335
- <dc:date>2025-10-28T14:09:13.211569</dc:date>
4336
  <dc:format>image/svg+xml</dc:format>
4337
  <dc:creator>
4338
  <ns2:Agent>
@@ -4481,83 +4494,96 @@ Installed 37 packages in 195ms
4481
  <g id="matplotlib.axis_2">
4482
  <g id="ytick_1">
4483
  <g id="grid-y--2" class="grid grid-y">
4484
- <path d="M 60.23 416.825206 L 847.294169 416.825206 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4485
  </g>
4486
  <g id="line2d_10">
4487
  <defs>
4488
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4489
  </defs>
4490
  <g>
4491
- <use ns4:href="#m0fca2865ba" x="60.23" y="416.825206" style="stroke: #000000; stroke-width: 0.8" />
4492
  </g>
4493
  </g>
4494
  <g id="text_10">
4495
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="420.624425" transform="rotate(-0 53.23 420.624425)">0.025</text>
4496
  </g>
4497
  </g>
4498
  <g id="ytick_2">
4499
  <g id="grid-y--3" class="grid grid-y">
4500
- <path d="M 60.23 346.161452 L 847.294169 346.161452 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4501
  </g>
4502
  <g id="line2d_11">
4503
  <g>
4504
- <use ns4:href="#m0fca2865ba" x="60.23" y="346.161452" style="stroke: #000000; stroke-width: 0.8" />
4505
  </g>
4506
  </g>
4507
  <g id="text_11">
4508
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="349.960671" transform="rotate(-0 53.23 349.960671)">0.030</text>
4509
  </g>
4510
  </g>
4511
  <g id="ytick_3">
4512
  <g id="grid-y--4" class="grid grid-y">
4513
- <path d="M 60.23 275.497698 L 847.294169 275.497698 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4514
  </g>
4515
  <g id="line2d_12">
4516
  <g>
4517
- <use ns4:href="#m0fca2865ba" x="60.23" y="275.497698" style="stroke: #000000; stroke-width: 0.8" />
4518
  </g>
4519
  </g>
4520
  <g id="text_12">
4521
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="279.296917" transform="rotate(-0 53.23 279.296917)">0.035</text>
4522
  </g>
4523
  </g>
4524
  <g id="ytick_4">
4525
  <g id="grid-y--5" class="grid grid-y">
4526
- <path d="M 60.23 204.833944 L 847.294169 204.833944 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4527
  </g>
4528
  <g id="line2d_13">
4529
  <g>
4530
- <use ns4:href="#m0fca2865ba" x="60.23" y="204.833944" style="stroke: #000000; stroke-width: 0.8" />
4531
  </g>
4532
  </g>
4533
  <g id="text_13">
4534
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="208.633163" transform="rotate(-0 53.23 208.633163)">0.040</text>
4535
  </g>
4536
  </g>
4537
  <g id="ytick_5">
4538
  <g id="grid-y--6" class="grid grid-y">
4539
- <path d="M 60.23 134.170191 L 847.294169 134.170191 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4540
  </g>
4541
  <g id="line2d_14">
4542
  <g>
4543
- <use ns4:href="#m0fca2865ba" x="60.23" y="134.170191" style="stroke: #000000; stroke-width: 0.8" />
4544
  </g>
4545
  </g>
4546
  <g id="text_14">
4547
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="137.969409" transform="rotate(-0 53.23 137.969409)">0.045</text>
4548
  </g>
4549
  </g>
4550
  <g id="ytick_6">
4551
  <g id="grid-y--7" class="grid grid-y">
4552
- <path d="M 60.23 63.506437 L 847.294169 63.506437 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4553
  </g>
4554
  <g id="line2d_15">
4555
  <g>
4556
- <use ns4:href="#m0fca2865ba" x="60.23" y="63.506437" style="stroke: #000000; stroke-width: 0.8" />
4557
  </g>
4558
  </g>
4559
  <g id="text_15">
4560
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="67.305655" transform="rotate(-0 53.23 67.305655)">0.050</text>
 
 
 
 
 
 
 
 
 
 
 
 
 
4561
  </g>
4562
  </g>
4563
  <g id="label--y" class="ylabel">
@@ -4565,37 +4591,37 @@ Installed 37 packages in 195ms
4565
  </g>
4566
  </g>
4567
  <g id="series--hf-kernels-swiglu" class="series">
4568
- <path d="M 96.005644 451.16779 L 185.444754 370.031668 L 274.883864 370.596978 L 364.322974 386.708314 L 453.762084 392.220086 L 543.201194 399.569118 L 632.640304 388.969554 L 722.079415 403.526288 L 811.518525 390.241503 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4569
  <defs>
4570
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4571
  </defs>
4572
  <g clip-path="url(#p620c7d392f)">
4573
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4574
- <use ns4:href="#md7efaf3aec" x="185.444754" y="370.031668" style="fill: #1f77b4; stroke: #1f77b4" />
4575
- <use ns4:href="#md7efaf3aec" x="274.883864" y="370.596978" style="fill: #1f77b4; stroke: #1f77b4" />
4576
- <use ns4:href="#md7efaf3aec" x="364.322974" y="386.708314" style="fill: #1f77b4; stroke: #1f77b4" />
4577
- <use ns4:href="#md7efaf3aec" x="453.762084" y="392.220086" style="fill: #1f77b4; stroke: #1f77b4" />
4578
- <use ns4:href="#md7efaf3aec" x="543.201194" y="399.569118" style="fill: #1f77b4; stroke: #1f77b4" />
4579
- <use ns4:href="#md7efaf3aec" x="632.640304" y="388.969554" style="fill: #1f77b4; stroke: #1f77b4" />
4580
- <use ns4:href="#md7efaf3aec" x="722.079415" y="403.526288" style="fill: #1f77b4; stroke: #1f77b4" />
4581
- <use ns4:href="#md7efaf3aec" x="811.518525" y="390.241503" style="fill: #1f77b4; stroke: #1f77b4" />
4582
  </g>
4583
  </g>
4584
  <g id="series--torch-eager" class="series">
4585
- <path d="M 96.005644 166.37873 L 185.444754 47.08418 L 274.883864 54.857193 L 364.322974 60.807081 L 453.762084 69.569387 L 543.201194 78.176231 L 632.640304 66.44605 L 722.079415 63.902153 L 811.518525 71.109857 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4586
  <defs>
4587
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4588
  </defs>
4589
  <g clip-path="url(#p620c7d392f)">
4590
- <use ns4:href="#m9b8c54d372" x="96.005644" y="166.37873" style="fill: #ff7f0e; stroke: #ff7f0e" />
4591
  <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4592
- <use ns4:href="#m9b8c54d372" x="274.883864" y="54.857193" style="fill: #ff7f0e; stroke: #ff7f0e" />
4593
- <use ns4:href="#m9b8c54d372" x="364.322974" y="60.807081" style="fill: #ff7f0e; stroke: #ff7f0e" />
4594
- <use ns4:href="#m9b8c54d372" x="453.762084" y="69.569387" style="fill: #ff7f0e; stroke: #ff7f0e" />
4595
- <use ns4:href="#m9b8c54d372" x="543.201194" y="78.176231" style="fill: #ff7f0e; stroke: #ff7f0e" />
4596
- <use ns4:href="#m9b8c54d372" x="632.640304" y="66.44605" style="fill: #ff7f0e; stroke: #ff7f0e" />
4597
- <use ns4:href="#m9b8c54d372" x="722.079415" y="63.902153" style="fill: #ff7f0e; stroke: #ff7f0e" />
4598
- <use ns4:href="#m9b8c54d372" x="811.518525" y="71.109857" style="fill: #ff7f0e; stroke: #ff7f0e" />
4599
  </g>
4600
  </g>
4601
  <g id="patch_3">
@@ -4610,30 +4636,30 @@ Installed 37 packages in 195ms
4610
  <g id="patch_6">
4611
  <path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4612
  </g>
4613
- <g id="text_16">
4614
  <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
4615
  </g>
4616
  <g id="legend" class="legend">
4617
  <g id="patch_7">
4618
- <path d="M 720.811356 466.37197 L 840.294169 466.37197 Q 842.294169 466.37197 842.294169 464.37197 L 842.294169 435.45947 Q 842.294169 433.45947 840.294169 433.45947 L 720.811356 433.45947 Q 718.811356 433.45947 718.811356 435.45947 L 718.811356 464.37197 Q 718.811356 466.37197 720.811356 466.37197 L 720.811356 466.37197 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4619
  </g>
4620
- <g id="line2d_16">
4621
- <path d="M 722.811356 441.557908 L 732.811356 441.557908 L 742.811356 441.557908 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4622
  <g>
4623
- <use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
4624
  </g>
4625
  </g>
4626
  <g id="legend-label--hf-kernels-swiglu" class="legend">
4627
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
4628
  </g>
4629
- <g id="line2d_17">
4630
- <path d="M 722.811356 456.514158 L 732.811356 456.514158 L 742.811356 456.514158 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4631
  <g>
4632
- <use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
4633
  </g>
4634
  </g>
4635
  <g id="legend-label--torch-eager" class="legend">
4636
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="460.014158" transform="rotate(-0 750.811356 460.014158)">torch_eager</text>
4637
  </g>
4638
  </g>
4639
  </g>
 
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
+ <dc:date>2025-10-29T14:27:49.999657</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
 
4021
  <g id="matplotlib.axis_2">
4022
  <g id="ytick_1">
4023
  <g id="grid-y--2" class="grid grid-y">
4024
+ <path d="M 60.23 428.188156 L 847.294169 428.188156 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4025
  </g>
4026
  <g id="line2d_10">
4027
  <defs>
4028
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4029
  </defs>
4030
  <g>
4031
+ <use ns4:href="#m0fca2865ba" x="60.23" y="428.188156" style="stroke: #000000; stroke-width: 0.8" />
4032
  </g>
4033
  </g>
4034
  <g id="text_10">
4035
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="431.987375" transform="rotate(-0 53.23 431.987375)">0.025</text>
4036
  </g>
4037
  </g>
4038
  <g id="ytick_2">
4039
  <g id="grid-y--3" class="grid grid-y">
4040
+ <path d="M 60.23 362.86799 L 847.294169 362.86799 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4041
  </g>
4042
  <g id="line2d_11">
4043
  <g>
4044
+ <use ns4:href="#m0fca2865ba" x="60.23" y="362.86799" style="stroke: #000000; stroke-width: 0.8" />
4045
  </g>
4046
  </g>
4047
  <g id="text_11">
4048
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="366.667209" transform="rotate(-0 53.23 366.667209)">0.030</text>
4049
  </g>
4050
  </g>
4051
  <g id="ytick_3">
4052
  <g id="grid-y--4" class="grid grid-y">
4053
+ <path d="M 60.23 297.547824 L 847.294169 297.547824 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4054
  </g>
4055
  <g id="line2d_12">
4056
  <g>
4057
+ <use ns4:href="#m0fca2865ba" x="60.23" y="297.547824" style="stroke: #000000; stroke-width: 0.8" />
4058
  </g>
4059
  </g>
4060
  <g id="text_12">
4061
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="301.347043" transform="rotate(-0 53.23 301.347043)">0.035</text>
4062
  </g>
4063
  </g>
4064
  <g id="ytick_4">
4065
  <g id="grid-y--5" class="grid grid-y">
4066
+ <path d="M 60.23 232.227658 L 847.294169 232.227658 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4067
  </g>
4068
  <g id="line2d_13">
4069
  <g>
4070
+ <use ns4:href="#m0fca2865ba" x="60.23" y="232.227658" style="stroke: #000000; stroke-width: 0.8" />
4071
  </g>
4072
  </g>
4073
  <g id="text_13">
4074
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="236.026877" transform="rotate(-0 53.23 236.026877)">0.040</text>
4075
  </g>
4076
  </g>
4077
  <g id="ytick_5">
4078
  <g id="grid-y--6" class="grid grid-y">
4079
+ <path d="M 60.23 166.907492 L 847.294169 166.907492 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4080
  </g>
4081
  <g id="line2d_14">
4082
  <g>
4083
+ <use ns4:href="#m0fca2865ba" x="60.23" y="166.907492" style="stroke: #000000; stroke-width: 0.8" />
4084
  </g>
4085
  </g>
4086
  <g id="text_14">
4087
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="170.706711" transform="rotate(-0 53.23 170.706711)">0.045</text>
4088
  </g>
4089
  </g>
4090
  <g id="ytick_6">
4091
  <g id="grid-y--7" class="grid grid-y">
4092
+ <path d="M 60.23 101.587327 L 847.294169 101.587327 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4093
  </g>
4094
  <g id="line2d_15">
4095
  <g>
4096
+ <use ns4:href="#m0fca2865ba" x="60.23" y="101.587327" style="stroke: #000000; stroke-width: 0.8" />
4097
  </g>
4098
  </g>
4099
  <g id="text_15">
4100
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="105.386545" transform="rotate(-0 53.23 105.386545)">0.050</text>
4101
+ </g>
4102
+ </g>
4103
+ <g id="ytick_7">
4104
+ <g id="grid-y--8" class="grid grid-y">
4105
+ <path d="M 60.23 36.267161 L 847.294169 36.267161 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4106
+ </g>
4107
+ <g id="line2d_16">
4108
+ <g>
4109
+ <use ns4:href="#m0fca2865ba" x="60.23" y="36.267161" style="stroke: #000000; stroke-width: 0.8" />
4110
+ </g>
4111
+ </g>
4112
+ <g id="text_16">
4113
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="40.066379" transform="rotate(-0 53.23 40.066379)">0.055</text>
4114
  </g>
4115
  </g>
4116
  <g id="label--y" class="ylabel">
 
4118
  </g>
4119
  </g>
4120
  <g id="series--hf-kernels-swiglu" class="series">
4121
+ <path d="M 96.005644 451.16779 L 185.444754 385.847624 L 274.883864 395.253728 L 364.322974 398.911657 L 453.762084 382.189695 L 543.201194 401.393823 L 632.640304 395.136152 L 722.079415 381.275213 L 811.518525 395.515009 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4122
  <defs>
4123
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4124
  </defs>
4125
  <g clip-path="url(#p620c7d392f)">
4126
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4127
+ <use ns4:href="#md7efaf3aec" x="185.444754" y="385.847624" style="fill: #1f77b4; stroke: #1f77b4" />
4128
+ <use ns4:href="#md7efaf3aec" x="274.883864" y="395.253728" style="fill: #1f77b4; stroke: #1f77b4" />
4129
+ <use ns4:href="#md7efaf3aec" x="364.322974" y="398.911657" style="fill: #1f77b4; stroke: #1f77b4" />
4130
+ <use ns4:href="#md7efaf3aec" x="453.762084" y="382.189695" style="fill: #1f77b4; stroke: #1f77b4" />
4131
+ <use ns4:href="#md7efaf3aec" x="543.201194" y="401.393823" style="fill: #1f77b4; stroke: #1f77b4" />
4132
+ <use ns4:href="#md7efaf3aec" x="632.640304" y="395.136152" style="fill: #1f77b4; stroke: #1f77b4" />
4133
+ <use ns4:href="#md7efaf3aec" x="722.079415" y="381.275213" style="fill: #1f77b4; stroke: #1f77b4" />
4134
+ <use ns4:href="#md7efaf3aec" x="811.518525" y="395.515009" style="fill: #1f77b4; stroke: #1f77b4" />
4135
  </g>
4136
  </g>
4137
  <g id="series--torch-eager" class="series">
4138
+ <path d="M 96.005644 194.328898 L 185.444754 47.08418 L 274.883864 59.495011 L 364.322974 61.46768 L 453.762084 66.170732 L 543.201194 84.055394 L 632.640304 56.503348 L 722.079415 80.67181 L 811.518525 81.586292 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4139
  <defs>
4140
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4141
  </defs>
4142
  <g clip-path="url(#p620c7d392f)">
4143
+ <use ns4:href="#m9b8c54d372" x="96.005644" y="194.328898" style="fill: #ff7f0e; stroke: #ff7f0e" />
4144
  <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4145
+ <use ns4:href="#m9b8c54d372" x="274.883864" y="59.495011" style="fill: #ff7f0e; stroke: #ff7f0e" />
4146
+ <use ns4:href="#m9b8c54d372" x="364.322974" y="61.46768" style="fill: #ff7f0e; stroke: #ff7f0e" />
4147
+ <use ns4:href="#m9b8c54d372" x="453.762084" y="66.170732" style="fill: #ff7f0e; stroke: #ff7f0e" />
4148
+ <use ns4:href="#m9b8c54d372" x="543.201194" y="84.055394" style="fill: #ff7f0e; stroke: #ff7f0e" />
4149
+ <use ns4:href="#m9b8c54d372" x="632.640304" y="56.503348" style="fill: #ff7f0e; stroke: #ff7f0e" />
4150
+ <use ns4:href="#m9b8c54d372" x="722.079415" y="80.67181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4151
+ <use ns4:href="#m9b8c54d372" x="811.518525" y="81.586292" style="fill: #ff7f0e; stroke: #ff7f0e" />
4152
  </g>
4153
  </g>
4154
  <g id="patch_3">
 
4163
  <g id="patch_6">
4164
  <path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4165
  </g>
4166
+ <g id="text_17">
4167
  <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
4168
  </g>
4169
  <g id="legend" class="legend">
4170
  <g id="patch_7">
4171
+ <path d="M 720.811356 64.7925 L 840.294169 64.7925 Q 842.294169 64.7925 842.294169 62.7925 L 842.294169 33.88 Q 842.294169 31.88 840.294169 31.88 L 720.811356 31.88 Q 718.811356 31.88 718.811356 33.88 L 718.811356 62.7925 Q 718.811356 64.7925 720.811356 64.7925 L 720.811356 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4172
  </g>
4173
+ <g id="line2d_17">
4174
+ <path d="M 722.811356 39.978438 L 732.811356 39.978438 L 742.811356 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4175
  <g>
4176
+ <use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4177
  </g>
4178
  </g>
4179
  <g id="legend-label--hf-kernels-swiglu" class="legend">
4180
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
4181
  </g>
4182
+ <g id="line2d_18">
4183
+ <path d="M 722.811356 54.934687 L 732.811356 54.934687 L 742.811356 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4184
  <g>
4185
+ <use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4186
  </g>
4187
  </g>
4188
  <g id="legend-label--torch-eager" class="legend">
4189
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="58.434687" transform="rotate(-0 750.811356 58.434687)">torch_eager</text>
4190
  </g>
4191
  </g>
4192
  </g>
 
4206
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4207
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4208
  </span> |
4209
+ Cell: combine | 4.24s
4210
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4211
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4212
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4332
  <div class="uv-install-logs" id="uv-logs-combine">
4333
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4334
  <div class="uv-logs-content" style="display: none;">
4335
+ Installed 37 packages in 218ms
4336
  </div>
4337
  </div>
4338
  <div class="cell-artifacts">
 
4345
  <rdf:RDF>
4346
  <ns2:Work>
4347
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4348
+ <dc:date>2025-10-29T14:27:49.999657</dc:date>
4349
  <dc:format>image/svg+xml</dc:format>
4350
  <dc:creator>
4351
  <ns2:Agent>
 
4494
  <g id="matplotlib.axis_2">
4495
  <g id="ytick_1">
4496
  <g id="grid-y--2" class="grid grid-y">
4497
+ <path d="M 60.23 428.188156 L 847.294169 428.188156 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4498
  </g>
4499
  <g id="line2d_10">
4500
  <defs>
4501
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4502
  </defs>
4503
  <g>
4504
+ <use ns4:href="#m0fca2865ba" x="60.23" y="428.188156" style="stroke: #000000; stroke-width: 0.8" />
4505
  </g>
4506
  </g>
4507
  <g id="text_10">
4508
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="431.987375" transform="rotate(-0 53.23 431.987375)">0.025</text>
4509
  </g>
4510
  </g>
4511
  <g id="ytick_2">
4512
  <g id="grid-y--3" class="grid grid-y">
4513
+ <path d="M 60.23 362.86799 L 847.294169 362.86799 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4514
  </g>
4515
  <g id="line2d_11">
4516
  <g>
4517
+ <use ns4:href="#m0fca2865ba" x="60.23" y="362.86799" style="stroke: #000000; stroke-width: 0.8" />
4518
  </g>
4519
  </g>
4520
  <g id="text_11">
4521
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="366.667209" transform="rotate(-0 53.23 366.667209)">0.030</text>
4522
  </g>
4523
  </g>
4524
  <g id="ytick_3">
4525
  <g id="grid-y--4" class="grid grid-y">
4526
+ <path d="M 60.23 297.547824 L 847.294169 297.547824 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4527
  </g>
4528
  <g id="line2d_12">
4529
  <g>
4530
+ <use ns4:href="#m0fca2865ba" x="60.23" y="297.547824" style="stroke: #000000; stroke-width: 0.8" />
4531
  </g>
4532
  </g>
4533
  <g id="text_12">
4534
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="301.347043" transform="rotate(-0 53.23 301.347043)">0.035</text>
4535
  </g>
4536
  </g>
4537
  <g id="ytick_4">
4538
  <g id="grid-y--5" class="grid grid-y">
4539
+ <path d="M 60.23 232.227658 L 847.294169 232.227658 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4540
  </g>
4541
  <g id="line2d_13">
4542
  <g>
4543
+ <use ns4:href="#m0fca2865ba" x="60.23" y="232.227658" style="stroke: #000000; stroke-width: 0.8" />
4544
  </g>
4545
  </g>
4546
  <g id="text_13">
4547
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="236.026877" transform="rotate(-0 53.23 236.026877)">0.040</text>
4548
  </g>
4549
  </g>
4550
  <g id="ytick_5">
4551
  <g id="grid-y--6" class="grid grid-y">
4552
+ <path d="M 60.23 166.907492 L 847.294169 166.907492 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4553
  </g>
4554
  <g id="line2d_14">
4555
  <g>
4556
+ <use ns4:href="#m0fca2865ba" x="60.23" y="166.907492" style="stroke: #000000; stroke-width: 0.8" />
4557
  </g>
4558
  </g>
4559
  <g id="text_14">
4560
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="170.706711" transform="rotate(-0 53.23 170.706711)">0.045</text>
4561
  </g>
4562
  </g>
4563
  <g id="ytick_6">
4564
  <g id="grid-y--7" class="grid grid-y">
4565
+ <path d="M 60.23 101.587327 L 847.294169 101.587327 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4566
  </g>
4567
  <g id="line2d_15">
4568
  <g>
4569
+ <use ns4:href="#m0fca2865ba" x="60.23" y="101.587327" style="stroke: #000000; stroke-width: 0.8" />
4570
  </g>
4571
  </g>
4572
  <g id="text_15">
4573
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="105.386545" transform="rotate(-0 53.23 105.386545)">0.050</text>
4574
+ </g>
4575
+ </g>
4576
+ <g id="ytick_7">
4577
+ <g id="grid-y--8" class="grid grid-y">
4578
+ <path d="M 60.23 36.267161 L 847.294169 36.267161 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4579
+ </g>
4580
+ <g id="line2d_16">
4581
+ <g>
4582
+ <use ns4:href="#m0fca2865ba" x="60.23" y="36.267161" style="stroke: #000000; stroke-width: 0.8" />
4583
+ </g>
4584
+ </g>
4585
+ <g id="text_16">
4586
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="40.066379" transform="rotate(-0 53.23 40.066379)">0.055</text>
4587
  </g>
4588
  </g>
4589
  <g id="label--y" class="ylabel">
 
4591
  </g>
4592
  </g>
4593
  <g id="series--hf-kernels-swiglu" class="series">
4594
+ <path d="M 96.005644 451.16779 L 185.444754 385.847624 L 274.883864 395.253728 L 364.322974 398.911657 L 453.762084 382.189695 L 543.201194 401.393823 L 632.640304 395.136152 L 722.079415 381.275213 L 811.518525 395.515009 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4595
  <defs>
4596
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4597
  </defs>
4598
  <g clip-path="url(#p620c7d392f)">
4599
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4600
+ <use ns4:href="#md7efaf3aec" x="185.444754" y="385.847624" style="fill: #1f77b4; stroke: #1f77b4" />
4601
+ <use ns4:href="#md7efaf3aec" x="274.883864" y="395.253728" style="fill: #1f77b4; stroke: #1f77b4" />
4602
+ <use ns4:href="#md7efaf3aec" x="364.322974" y="398.911657" style="fill: #1f77b4; stroke: #1f77b4" />
4603
+ <use ns4:href="#md7efaf3aec" x="453.762084" y="382.189695" style="fill: #1f77b4; stroke: #1f77b4" />
4604
+ <use ns4:href="#md7efaf3aec" x="543.201194" y="401.393823" style="fill: #1f77b4; stroke: #1f77b4" />
4605
+ <use ns4:href="#md7efaf3aec" x="632.640304" y="395.136152" style="fill: #1f77b4; stroke: #1f77b4" />
4606
+ <use ns4:href="#md7efaf3aec" x="722.079415" y="381.275213" style="fill: #1f77b4; stroke: #1f77b4" />
4607
+ <use ns4:href="#md7efaf3aec" x="811.518525" y="395.515009" style="fill: #1f77b4; stroke: #1f77b4" />
4608
  </g>
4609
  </g>
4610
  <g id="series--torch-eager" class="series">
4611
+ <path d="M 96.005644 194.328898 L 185.444754 47.08418 L 274.883864 59.495011 L 364.322974 61.46768 L 453.762084 66.170732 L 543.201194 84.055394 L 632.640304 56.503348 L 722.079415 80.67181 L 811.518525 81.586292 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4612
  <defs>
4613
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4614
  </defs>
4615
  <g clip-path="url(#p620c7d392f)">
4616
+ <use ns4:href="#m9b8c54d372" x="96.005644" y="194.328898" style="fill: #ff7f0e; stroke: #ff7f0e" />
4617
  <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4618
+ <use ns4:href="#m9b8c54d372" x="274.883864" y="59.495011" style="fill: #ff7f0e; stroke: #ff7f0e" />
4619
+ <use ns4:href="#m9b8c54d372" x="364.322974" y="61.46768" style="fill: #ff7f0e; stroke: #ff7f0e" />
4620
+ <use ns4:href="#m9b8c54d372" x="453.762084" y="66.170732" style="fill: #ff7f0e; stroke: #ff7f0e" />
4621
+ <use ns4:href="#m9b8c54d372" x="543.201194" y="84.055394" style="fill: #ff7f0e; stroke: #ff7f0e" />
4622
+ <use ns4:href="#m9b8c54d372" x="632.640304" y="56.503348" style="fill: #ff7f0e; stroke: #ff7f0e" />
4623
+ <use ns4:href="#m9b8c54d372" x="722.079415" y="80.67181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4624
+ <use ns4:href="#m9b8c54d372" x="811.518525" y="81.586292" style="fill: #ff7f0e; stroke: #ff7f0e" />
4625
  </g>
4626
  </g>
4627
  <g id="patch_3">
 
4636
  <g id="patch_6">
4637
  <path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4638
  </g>
4639
+ <g id="text_17">
4640
  <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
4641
  </g>
4642
  <g id="legend" class="legend">
4643
  <g id="patch_7">
4644
+ <path d="M 720.811356 64.7925 L 840.294169 64.7925 Q 842.294169 64.7925 842.294169 62.7925 L 842.294169 33.88 Q 842.294169 31.88 840.294169 31.88 L 720.811356 31.88 Q 718.811356 31.88 718.811356 33.88 L 718.811356 62.7925 Q 718.811356 64.7925 720.811356 64.7925 L 720.811356 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4645
  </g>
4646
+ <g id="line2d_17">
4647
+ <path d="M 722.811356 39.978438 L 732.811356 39.978438 L 742.811356 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4648
  <g>
4649
+ <use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4650
  </g>
4651
  </g>
4652
  <g id="legend-label--hf-kernels-swiglu" class="legend">
4653
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
4654
  </g>
4655
+ <g id="line2d_18">
4656
+ <path d="M 722.811356 54.934687 L 732.811356 54.934687 L 742.811356 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4657
  <g>
4658
+ <use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4659
  </g>
4660
  </g>
4661
  <g id="legend-label--torch-eager" class="legend">
4662
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="58.434687" transform="rotate(-0 750.811356 58.434687)">torch_eager</text>
4663
  </g>
4664
  </g>
4665
  </g>
causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl CHANGED
@@ -1,24 +1,24 @@
1
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.06712200001857127, "p50": 0.06883200001084333, "p90": 0.06976199995278876, "mean": 0.06901199997173535, "iqr": 0.0014600000213249587, "raw_times": [0.06976199995278876, 0.07104199994500959, 0.06712200001857127, 0.0683019999314638, 0.06883200001084333], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0738530000035098, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
2
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08455299996512622, "p50": 0.08599400007369695, "p90": 0.0868530000843748, "mean": 0.08612520005044644, "iqr": 0.0014299999975264654, "raw_times": [0.08780300004218589, 0.08455299996512622, 0.0868530000843748, 0.08542300008684833, 0.08599400007369695], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08941300006881647, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
3
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08494299993344612, "p50": 0.08714299997336639, "p90": 0.08724299993900786, "mean": 0.086546999955317, "iqr": 0.0020200000108161476, "raw_times": [0.08522299992819171, 0.08714299997336639, 0.08818300000257295, 0.08724299993900786, 0.08494299993344612], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09105300000555872, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
4
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08327299997290538, "p50": 0.084122999965075, "p90": 0.08580299993354856, "mean": 0.08452299998680246, "iqr": 0.0023699999474047218, "raw_times": [0.08327299997290538, 0.084122999965075, 0.08598300007633952, 0.08580299993354856, 0.08343299998614384], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08891300001323543, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
5
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08298299997022696, "p50": 0.08508299993081891, "p90": 0.08600299997851835, "mean": 0.0849267999683434, "iqr": 0.0016210000239880173, "raw_times": [0.08298299997022696, 0.08508299993081891, 0.08600299997851835, 0.08438199995453033, 0.08618300000762247], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08780300004218589, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
6
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08270299997548136, "p50": 0.08315299999139825, "p90": 0.0846430000365217, "mean": 0.08407499999520951, "iqr": 0.0019010000187336118, "raw_times": [0.08315299999139825, 0.08713399995485815, 0.08270299997548136, 0.08274200001778809, 0.0846430000365217], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08981299993138236, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
7
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08372299998882227, "p50": 0.08510199995725998, "p90": 0.08608299992829416, "mean": 0.08701479998762807, "iqr": 0.0011499998890940333, "raw_times": [0.08493300003920012, 0.09523300002456381, 0.08510199995725998, 0.08372299998882227, 0.08608299992829416], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08923300003971235, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
8
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08285199999136239, "p50": 0.08483300007355865, "p90": 0.08511300006830425, "mean": 0.08449480001218035, "iqr": 0.0016500000583619112, "raw_times": [0.08285199999136239, 0.08346300000994233, 0.08483300007355865, 0.08621299991773412, 0.08511300006830425], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08870299996033282, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
9
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08278300003894401, "p50": 0.08427300008406746, "p90": 0.08444299999155191, "mean": 0.08422300002166594, "iqr": 0.0002599999788799323, "raw_times": [0.08444299999155191, 0.08418300001267198, 0.08278300003894401, 0.08543299998109433, 0.08427300008406746], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08903299999474257, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
10
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08352199995442788, "p50": 0.0842329999386493, "p90": 0.08553300006042264, "mean": 0.08496079999531503, "iqr": 0.0014400000054592965, "raw_times": [0.08409300005496334, 0.08742299996811198, 0.08553300006042264, 0.08352199995442788, 0.0842329999386493], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08985400006622513, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
11
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.14414499992199126, "p50": 0.14512600000671227, "p90": 0.14515400005166157, "mean": 0.1465472000063528, "iqr": 0.0008580000212532468, "raw_times": [0.14512600000671227, 0.14414499992199126, 0.14429600003040832, 0.15401500002099056, 0.14515400005166157], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.14571500003057736, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
12
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16020600003230356, "p50": 0.16135600003508443, "p90": 0.16139600006681576, "mean": 0.16140360005465482, "iqr": 0.00029099999210302485, "raw_times": [0.16139600006681576, 0.1629550000643576, 0.16110500007471273, 0.16020600003230356, 0.16135600003508443], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1623660000404925, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
13
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07979300005445111, "p50": 0.08039299996198679, "p90": 0.08136300004935038, "mean": 0.08070500002759218, "iqr": 0.001150000002780871, "raw_times": [0.0802130000465695, 0.0817630000256031, 0.07979300005445111, 0.08039299996198679, 0.08136300004935038], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0855329999467358, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
14
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0800829999434427, "p50": 0.08147299990923784, "p90": 0.08197300007850572, "mean": 0.08146099996793055, "iqr": 0.00109000018255756, "raw_times": [0.0800829999434427, 0.08197300007850572, 0.08147299990923784, 0.08289300001251831, 0.08088299989594816], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08291199992527254, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
15
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0799729999698684, "p50": 0.08137199995417177, "p90": 0.081513000054656, "mean": 0.08127659998535819, "iqr": 0.0006500000608866685, "raw_times": [0.0799729999698684, 0.08266199995432544, 0.081513000054656, 0.08086299999376934, 0.08137199995417177], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08939400004237541, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
16
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08067300007041922, "p50": 0.08162300002823031, "p90": 0.08189199991193163, "mean": 0.08365860001049441, "iqr": 0.0008099999604382901, "raw_times": [0.08067300007041922, 0.08108199995149334, 0.08189199991193163, 0.08162300002823031, 0.09302300009039755], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08415299998887349, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
17
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0806030000148894, "p50": 0.08186299999124458, "p90": 0.08199299998068454, "mean": 0.08162900001025264, "iqr": 0.001009999891721236, "raw_times": [0.08270299997548136, 0.08186299999124458, 0.0806030000148894, 0.08199299998068454, 0.08098300008896331], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10199300004387624, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
18
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08040199998049502, "p50": 0.08168299996214046, "p90": 0.08185199999388715, "mean": 0.08171659999334224, "iqr": 0.0013889999763705418, "raw_times": [0.0804630000175166, 0.08418300001267198, 0.08168299996214046, 0.08040199998049502, 0.08185199999388715], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08522300004187855, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
19
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08097300008103048, "p50": 0.08150300004672317, "p90": 0.08173299988811777, "mean": 0.08153900000706926, "iqr": 0.0005599998758043512, "raw_times": [0.08117300001231342, 0.08231300000716146, 0.08150300004672317, 0.08173299988811777, 0.08097300008103048], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08440300007350743, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
20
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0802130000465695, "p50": 0.08124300006784324, "p90": 0.08242299998073577, "mean": 0.08162480000919459, "iqr": 0.0012000000424450263, "raw_times": [0.0802130000465695, 0.08302200001253368, 0.08242299998073577, 0.08124300006784324, 0.08122299993829074], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08460300000479037, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
21
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09226300005593657, "p50": 0.09320300000581483, "p90": 0.0934630000983816, "mean": 0.09316100004070904, "iqr": 0.0007800000503266347, "raw_times": [0.09419299999535724, 0.09320300000581483, 0.0934630000983816, 0.09226300005593657, 0.09268300004805496], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0951240000404141, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
22
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09887299995625654, "p50": 0.09917300008055463, "p90": 0.09990300009121711, "mean": 0.09939520000443736, "iqr": 0.0009100001534534385, "raw_times": [0.09887299995625654, 0.09917300008055463, 0.09990300009121711, 0.10003399995639484, 0.09899299993776367], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1023739999936879, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
23
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4842959999677987, "p50": 0.4860569999891595, "p90": 0.4878769999550059, "mean": 0.48646659997757524, "iqr": 0.002959999960694404, "raw_times": [0.4849169999943115, 0.4860569999891595, 0.4878769999550059, 0.4842959999677987, 0.4891859999816006], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4877669999814316, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
24
- {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4968179999877975, "p50": 0.49805800006197387, "p90": 0.4990780000753148, "mean": 0.4983496000022569, "iqr": 0.001141000097959477, "raw_times": [0.4979369999773553, 0.49985699990884314, 0.4990780000753148, 0.49805800006197387, 0.4968179999877975], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.49727700002222264, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-29T14:27:35Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047051000024112, "p50": 0.048391000007086404, "p90": 0.048571999968771706, "mean": 0.04890720000503279, "iqr": 0.0005509999709829572, "raw_times": [0.0525010000274051, 0.048571999968771706, 0.04802099999778875, 0.048391000007086404, 0.047051000024112], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058991999992485944, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
2
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05480199996554802, "p50": 0.05610099998420992, "p90": 0.05628099995647062, "mean": 0.056069199968078465, "iqr": 0.0006299999881775875, "raw_times": [0.057510999965870724, 0.05628099995647062, 0.05610099998420992, 0.055650999968293036, 0.05480199996554802], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06025200002568454, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
3
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05469199999197372, "p50": 0.056971999981669796, "p90": 0.057361000017408514, "mean": 0.056363600003805914, "iqr": 0.0025490000439276628, "raw_times": [0.05469199999197372, 0.057361000017408514, 0.056971999981669796, 0.05798100005449669, 0.05481199997348085], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0596809999819925, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
4
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05416100003685642, "p50": 0.05502099998011545, "p90": 0.05511200004093553, "mean": 0.05489540000098714, "iqr": 0.00016000007008187822, "raw_times": [0.05416100003685642, 0.05495199997085365, 0.055230999976174644, 0.05511200004093553, 0.05502099998011545], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05659100003185813, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
5
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052401000004920206, "p50": 0.055000999964249786, "p90": 0.056301000029179704, "mean": 0.054810999995424936, "iqr": 0.0023400000372930663, "raw_times": [0.052401000004920206, 0.056301000029179704, 0.056390999986888346, 0.05396099999188664, 0.055000999964249786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05838100003074942, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
6
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05270100001553146, "p50": 0.05342100001826111, "p90": 0.054111999986616865, "mean": 0.053611199996339565, "iqr": 0.0008709999974598759, "raw_times": [0.05324099998915699, 0.054580999972131394, 0.054111999986616865, 0.05342100001826111, 0.05270100001553146], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0580610000042725, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
7
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051810999991630524, "p50": 0.05310099999178419, "p90": 0.053301000036753976, "mean": 0.05577720000928821, "iqr": 0.0007700000423938036, "raw_times": [0.051810999991630524, 0.05253099999436017, 0.06814200003191218, 0.053301000036753976, 0.05310099999178419], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056801999960498506, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
8
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.053270999956112064, "p50": 0.05397199998924407, "p90": 0.05399100001568513, "mean": 0.054061200000887766, "iqr": 0.00023000001192485797, "raw_times": [0.05399100001568513, 0.05531100003963729, 0.053270999956112064, 0.05397199998924407, 0.05376100000376027], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056641000014678866, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
9
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052231000040592335, "p50": 0.052561000018158666, "p90": 0.0526110000009794, "mean": 0.0529970000116009, "iqr": 0.0002200000039920269, "raw_times": [0.052231000040592335, 0.052390999996987375, 0.05519100000128674, 0.052561000018158666, 0.0526110000009794], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05621100001462764, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
10
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052561000018158666, "p50": 0.053772000001117704, "p90": 0.05471100001841478, "mean": 0.057879400003457704, "iqr": 0.0021000000174353772, "raw_times": [0.0526110000009794, 0.05471100001841478, 0.053772000001117704, 0.052561000018158666, 0.07574199997861797], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06976200000963217, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
11
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051341000016691396, "p50": 0.05308099997591853, "p90": 0.05349200000637211, "mean": 0.05272739998645193, "iqr": 0.0014310000437944836, "raw_times": [0.052060999962577625, 0.05366199997069998, 0.05308099997591853, 0.051341000016691396, 0.05349200000637211], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05661099999088037, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
12
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05081099999415528, "p50": 0.053202000003693684, "p90": 0.05362099994954406, "mean": 0.05282339998302632, "iqr": 0.0011499999459374521, "raw_times": [0.05081099999415528, 0.052471000003606605, 0.05362099994954406, 0.054011999964131974, 0.053202000003693684], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05642200000011144, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
13
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052221000032659504, "p50": 0.05397099999981947, "p90": 0.05448100000648992, "mean": 0.05380100001275423, "iqr": 0.001479999980347202, "raw_times": [0.052221000032659504, 0.05300100002614272, 0.055330999998659536, 0.05448100000648992, 0.05397099999981947], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058421999995061924, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
14
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05164199995988383, "p50": 0.053460999993149017, "p90": 0.05421100001967716, "mean": 0.0532791999944493, "iqr": 0.00139000002263856, "raw_times": [0.0528209999970386, 0.054261000002497894, 0.05421100001967716, 0.05164199995988383, 0.053460999993149017], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05741199998965385, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
15
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052460999995673774, "p50": 0.0557109999590466, "p90": 0.05585100001326282, "mean": 0.05600519999688913, "iqr": 0.002330000029360235, "raw_times": [0.052460999995673774, 0.0557109999590466, 0.05585100001326282, 0.053520999983902584, 0.06248200003255988], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058421000005637325, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
16
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0540510000064387, "p50": 0.0541219999945497, "p90": 0.05425200004083308, "mean": 0.054337400013082515, "iqr": 0.0001910000264615519, "raw_times": [0.0540510000064387, 0.05425200004083308, 0.05520100000921957, 0.0541219999945497, 0.05406100001437153], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05772200000819794, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
17
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05302099998516496, "p50": 0.05408099997339377, "p90": 0.0544409999747586, "mean": 0.05571119997966889, "iqr": 0.0008299999763039523, "raw_times": [0.05302099998516496, 0.0544409999747586, 0.06340199996657248, 0.053610999998454645, 0.05408099997339377], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05787100002407897, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
18
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052391999986411975, "p50": 0.05292200000894809, "p90": 0.053131000015582686, "mean": 0.05318180001268047, "iqr": 0.00026899999738816405, "raw_times": [0.053131000015582686, 0.054602000034265075, 0.052391999986411975, 0.05286200001819452, 0.05292200000894809], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05755099999760205, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
19
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05195099998900332, "p50": 0.05479100002503401, "p90": 0.05620100000669481, "mean": 0.05852919999824735, "iqr": 0.0034000000255218765, "raw_times": [0.07690199998933167, 0.05620100000669481, 0.05479100002503401, 0.05195099998900332, 0.052800999981172936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057030999982998765, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
20
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05290100000365783, "p50": 0.05457100002104198, "p90": 0.055251000048883725, "mean": 0.055353400011881604, "iqr": 0.001779000058377278, "raw_times": [0.05290100000365783, 0.06057199999531804, 0.055251000048883725, 0.05347199999050645, 0.05457100002104198], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056300999972336285, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
21
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052152000023397704, "p50": 0.05461199998535449, "p90": 0.05471200000783938, "mean": 0.05381760001910152, "iqr": 0.001900999961890193, "raw_times": [0.052152000023397704, 0.05461199998535449, 0.05480100003296684, 0.05471200000783938, 0.052811000045949186], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05758200001082514, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
22
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05339099999446262, "p50": 0.05463100001179555, "p90": 0.055171999974845676, "mean": 0.05451339999353877, "iqr": 0.0011309999763398082, "raw_times": [0.05404099999850587, 0.055331999988084135, 0.05339099999446262, 0.05463100001179555, 0.055171999974845676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058501000012256554, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
23
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05277100001421786, "p50": 0.053860999969401746, "p90": 0.054361000024982786, "mean": 0.053951200004576094, "iqr": 0.0007190000133050489, "raw_times": [0.05277100001421786, 0.05512100000260034, 0.05364200001167774, 0.053860999969401746, 0.054361000024982786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057851999997637904, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
24
+ {"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05124100005104992, "p50": 0.053691000005073874, "p90": 0.054261999991922494, "mean": 0.05327740001348502, "iqr": 0.0014609999539061391, "raw_times": [0.05124100005104992, 0.05439199998136246, 0.054261999991922494, 0.053691000005073874, 0.052801000038016355], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05714199994599767, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
causal_conv1d/impls/cells/benchmark.py CHANGED
@@ -4,37 +4,28 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
10
  # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
11
  # ///
12
  import torch
13
- import torch.nn.functional as F
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 
16
 
 
 
17
 
18
- def torch_causal_conv1d(input_tensor, weight, bias):
19
- # Convert to weight dtype for computation
20
- x = input_tensor.to(weight.dtype)
21
- dim = weight.shape[0]
22
- width = weight.shape[1]
23
- seqlen = input_tensor.shape[-1]
24
 
25
- # Depthwise causal conv1d using PyTorch
26
- out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
27
-
28
- # Truncate to original sequence length
29
- out = out[..., :seqlen]
30
-
31
- # Convert back to original dtype
32
- return out.to(input_tensor.dtype)
33
 
34
 
35
  run_benchmark(
36
  kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
37
- impl_name="torch_eager",
38
- impl_tags={"family": "pytorch", "backend": "eager"},
39
- impl_func=torch_causal_conv1d,
40
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
+ # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
11
  # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
12
  # ///
13
  import torch
 
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
+ from kernels import get_kernel
17
 
18
+ # Load the causal conv1d kernel
19
+ causal_conv1d = get_kernel("kernels-community/causal-conv1d")
20
 
 
 
 
 
 
 
21
 
22
+ def hf_kernels_causal_conv1d(input_tensor, weight, bias):
23
+ return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias)
 
 
 
 
 
 
24
 
25
 
26
  run_benchmark(
27
  kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
28
+ impl_name="hf_kernels_causal_conv1d",
29
+ impl_tags={"family": "hf-kernels", "backend": "cuda"},
30
+ impl_func=hf_kernels_causal_conv1d,
31
  )
causal_conv1d/impls/hf_kernels_causal_conv1d.html CHANGED
The diff for this file is too large to render. See raw diff
 
causal_conv1d/impls/torch_causal_conv1d.html CHANGED
The diff for this file is too large to render. See raw diff
 
causal_conv1d/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: a640783c4d5cb4dc1763b97fa9a3e0cf2d278599a3fc38ba2056846c760ec8fe
  • Pointer size: 130 Bytes
  • Size of remote file: 35.4 kB

Git LFS Details

  • SHA256: 3d92f3a3aa92e11f21958cf1c591a4e709fd40f7b0cccbd544c1e1a77b11bcd2
  • Pointer size: 130 Bytes
  • Size of remote file: 35.4 kB
causal_conv1d/results/combined_results.html CHANGED
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
- <dc:date>2025-10-28T14:09:26.231666</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
@@ -4216,70 +4216,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4216
  <g id="matplotlib.axis_2">
4217
  <g id="ytick_1">
4218
  <g id="grid-y--2" class="grid grid-y">
4219
- <path d="M 47.72 375.159294 L 831.034248 375.159294 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4220
  </g>
4221
  <g id="line2d_25">
4222
  <defs>
4223
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4224
  </defs>
4225
  <g>
4226
- <use ns4:href="#m0fca2865ba" x="47.72" y="375.159294" style="stroke: #000000; stroke-width: 0.8" />
4227
  </g>
4228
  </g>
4229
  <g id="text_25">
4230
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="378.958513" transform="rotate(-0 40.72 378.958513)">0.1</text>
4231
  </g>
4232
  </g>
4233
  <g id="ytick_2">
4234
  <g id="grid-y--3" class="grid grid-y">
4235
- <path d="M 47.72 292.369752 L 831.034248 292.369752 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4236
  </g>
4237
  <g id="line2d_26">
4238
  <g>
4239
- <use ns4:href="#m0fca2865ba" x="47.72" y="292.369752" style="stroke: #000000; stroke-width: 0.8" />
4240
  </g>
4241
  </g>
4242
  <g id="text_26">
4243
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.168971" transform="rotate(-0 40.72 296.168971)">0.2</text>
4244
  </g>
4245
  </g>
4246
  <g id="ytick_3">
4247
  <g id="grid-y--4" class="grid grid-y">
4248
- <path d="M 47.72 209.58021 L 831.034248 209.58021 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4249
  </g>
4250
  <g id="line2d_27">
4251
  <g>
4252
- <use ns4:href="#m0fca2865ba" x="47.72" y="209.58021" style="stroke: #000000; stroke-width: 0.8" />
4253
  </g>
4254
  </g>
4255
  <g id="text_27">
4256
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.379429" transform="rotate(-0 40.72 213.379429)">0.3</text>
4257
  </g>
4258
  </g>
4259
  <g id="ytick_4">
4260
  <g id="grid-y--5" class="grid grid-y">
4261
- <path d="M 47.72 126.790668 L 831.034248 126.790668 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4262
  </g>
4263
  <g id="line2d_28">
4264
  <g>
4265
- <use ns4:href="#m0fca2865ba" x="47.72" y="126.790668" style="stroke: #000000; stroke-width: 0.8" />
4266
  </g>
4267
  </g>
4268
  <g id="text_28">
4269
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.589887" transform="rotate(-0 40.72 130.589887)">0.4</text>
4270
  </g>
4271
  </g>
4272
  <g id="ytick_5">
4273
  <g id="grid-y--6" class="grid grid-y">
4274
- <path d="M 47.72 44.001126 L 831.034248 44.001126 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4275
  </g>
4276
  <g id="line2d_29">
4277
  <g>
4278
- <use ns4:href="#m0fca2865ba" x="47.72" y="44.001126" style="stroke: #000000; stroke-width: 0.8" />
4279
  </g>
4280
  </g>
4281
  <g id="text_29">
4282
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="47.800344" transform="rotate(-0 40.72 47.800344)">0.5</text>
4283
  </g>
4284
  </g>
4285
  <g id="label--y" class="ylabel">
@@ -4287,66 +4287,66 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4287
  </g>
4288
  </g>
4289
  <g id="series--hf-kernels-causal-conv1d" class="series">
4290
- <path d="M 83.325193 420.186871 L 114.286231 412.917949 L 145.247268 412.868275 L 176.208306 413.042133 L 207.169343 414.110118 L 238.130381 414.110946 L 269.091418 413.580265 L 300.052455 414.938014 L 331.013493 414.656529 L 361.97453 415.161545 L 392.935568 415.575493 L 423.896605 414.035608 L 454.857643 415.195489 L 485.81868 415.20294 L 516.779718 414.706203 L 547.740755 414.043887 L 578.701793 412.479164 L 609.66283 413.795518 L 640.623868 413.141481 L 671.584905 413.489197 L 702.545943 414.151513 L 733.50698 413.886586 L 764.468018 414.582019 L 795.429055 415.368519 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4291
  <defs>
4292
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4293
  </defs>
4294
  <g clip-path="url(#pb49fc4c8d2)">
4295
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
4296
- <use ns4:href="#md7efaf3aec" x="114.286231" y="412.917949" style="fill: #1f77b4; stroke: #1f77b4" />
4297
- <use ns4:href="#md7efaf3aec" x="145.247268" y="412.868275" style="fill: #1f77b4; stroke: #1f77b4" />
4298
- <use ns4:href="#md7efaf3aec" x="176.208306" y="413.042133" style="fill: #1f77b4; stroke: #1f77b4" />
4299
- <use ns4:href="#md7efaf3aec" x="207.169343" y="414.110118" style="fill: #1f77b4; stroke: #1f77b4" />
4300
- <use ns4:href="#md7efaf3aec" x="238.130381" y="414.110946" style="fill: #1f77b4; stroke: #1f77b4" />
4301
- <use ns4:href="#md7efaf3aec" x="269.091418" y="413.580265" style="fill: #1f77b4; stroke: #1f77b4" />
4302
- <use ns4:href="#md7efaf3aec" x="300.052455" y="414.938014" style="fill: #1f77b4; stroke: #1f77b4" />
4303
- <use ns4:href="#md7efaf3aec" x="331.013493" y="414.656529" style="fill: #1f77b4; stroke: #1f77b4" />
4304
- <use ns4:href="#md7efaf3aec" x="361.97453" y="415.161545" style="fill: #1f77b4; stroke: #1f77b4" />
4305
- <use ns4:href="#md7efaf3aec" x="392.935568" y="415.575493" style="fill: #1f77b4; stroke: #1f77b4" />
4306
- <use ns4:href="#md7efaf3aec" x="423.896605" y="414.035608" style="fill: #1f77b4; stroke: #1f77b4" />
4307
- <use ns4:href="#md7efaf3aec" x="454.857643" y="415.195489" style="fill: #1f77b4; stroke: #1f77b4" />
4308
- <use ns4:href="#md7efaf3aec" x="485.81868" y="415.20294" style="fill: #1f77b4; stroke: #1f77b4" />
4309
- <use ns4:href="#md7efaf3aec" x="516.779718" y="414.706203" style="fill: #1f77b4; stroke: #1f77b4" />
4310
- <use ns4:href="#md7efaf3aec" x="547.740755" y="414.043887" style="fill: #1f77b4; stroke: #1f77b4" />
4311
- <use ns4:href="#md7efaf3aec" x="578.701793" y="412.479164" style="fill: #1f77b4; stroke: #1f77b4" />
4312
- <use ns4:href="#md7efaf3aec" x="609.66283" y="413.795518" style="fill: #1f77b4; stroke: #1f77b4" />
4313
- <use ns4:href="#md7efaf3aec" x="640.623868" y="413.141481" style="fill: #1f77b4; stroke: #1f77b4" />
4314
- <use ns4:href="#md7efaf3aec" x="671.584905" y="413.489197" style="fill: #1f77b4; stroke: #1f77b4" />
4315
- <use ns4:href="#md7efaf3aec" x="702.545943" y="414.151513" style="fill: #1f77b4; stroke: #1f77b4" />
4316
- <use ns4:href="#md7efaf3aec" x="733.50698" y="413.886586" style="fill: #1f77b4; stroke: #1f77b4" />
4317
- <use ns4:href="#md7efaf3aec" x="764.468018" y="414.582019" style="fill: #1f77b4; stroke: #1f77b4" />
4318
- <use ns4:href="#md7efaf3aec" x="795.429055" y="415.368519" style="fill: #1f77b4; stroke: #1f77b4" />
4319
  </g>
4320
  </g>
4321
  <g id="series--torch-eager" class="series">
4322
- <path d="M 83.325193 400.963139 L 114.286231 386.754798 L 145.247268 385.803546 L 176.208306 388.30379 L 207.169343 387.50901 L 238.130381 389.106849 L 269.091418 387.49328 L 300.052455 387.715984 L 331.013493 388.179606 L 361.97453 388.212722 L 392.935568 337.799686 L 423.896605 324.362943 L 454.857643 391.39184 L 485.81868 390.497713 L 516.779718 390.58133 L 547.740755 390.373529 L 578.701793 390.174834 L 609.66283 390.323855 L 640.623868 390.472876 L 671.584905 390.688129 L 702.545943 380.7865 L 733.50698 375.843964 L 764.468018 55.544472 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4323
  <defs>
4324
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4325
  </defs>
4326
  <g clip-path="url(#pb49fc4c8d2)">
4327
- <use ns4:href="#m9b8c54d372" x="83.325193" y="400.963139" style="fill: #ff7f0e; stroke: #ff7f0e" />
4328
- <use ns4:href="#m9b8c54d372" x="114.286231" y="386.754798" style="fill: #ff7f0e; stroke: #ff7f0e" />
4329
- <use ns4:href="#m9b8c54d372" x="145.247268" y="385.803546" style="fill: #ff7f0e; stroke: #ff7f0e" />
4330
- <use ns4:href="#m9b8c54d372" x="176.208306" y="388.30379" style="fill: #ff7f0e; stroke: #ff7f0e" />
4331
- <use ns4:href="#m9b8c54d372" x="207.169343" y="387.50901" style="fill: #ff7f0e; stroke: #ff7f0e" />
4332
- <use ns4:href="#m9b8c54d372" x="238.130381" y="389.106849" style="fill: #ff7f0e; stroke: #ff7f0e" />
4333
- <use ns4:href="#m9b8c54d372" x="269.091418" y="387.49328" style="fill: #ff7f0e; stroke: #ff7f0e" />
4334
- <use ns4:href="#m9b8c54d372" x="300.052455" y="387.715984" style="fill: #ff7f0e; stroke: #ff7f0e" />
4335
- <use ns4:href="#m9b8c54d372" x="331.013493" y="388.179606" style="fill: #ff7f0e; stroke: #ff7f0e" />
4336
- <use ns4:href="#m9b8c54d372" x="361.97453" y="388.212722" style="fill: #ff7f0e; stroke: #ff7f0e" />
4337
- <use ns4:href="#m9b8c54d372" x="392.935568" y="337.799686" style="fill: #ff7f0e; stroke: #ff7f0e" />
4338
- <use ns4:href="#m9b8c54d372" x="423.896605" y="324.362943" style="fill: #ff7f0e; stroke: #ff7f0e" />
4339
- <use ns4:href="#m9b8c54d372" x="454.857643" y="391.39184" style="fill: #ff7f0e; stroke: #ff7f0e" />
4340
- <use ns4:href="#m9b8c54d372" x="485.81868" y="390.497713" style="fill: #ff7f0e; stroke: #ff7f0e" />
4341
- <use ns4:href="#m9b8c54d372" x="516.779718" y="390.58133" style="fill: #ff7f0e; stroke: #ff7f0e" />
4342
- <use ns4:href="#m9b8c54d372" x="547.740755" y="390.373529" style="fill: #ff7f0e; stroke: #ff7f0e" />
4343
- <use ns4:href="#m9b8c54d372" x="578.701793" y="390.174834" style="fill: #ff7f0e; stroke: #ff7f0e" />
4344
- <use ns4:href="#m9b8c54d372" x="609.66283" y="390.323855" style="fill: #ff7f0e; stroke: #ff7f0e" />
4345
- <use ns4:href="#m9b8c54d372" x="640.623868" y="390.472876" style="fill: #ff7f0e; stroke: #ff7f0e" />
4346
- <use ns4:href="#m9b8c54d372" x="671.584905" y="390.688129" style="fill: #ff7f0e; stroke: #ff7f0e" />
4347
- <use ns4:href="#m9b8c54d372" x="702.545943" y="380.7865" style="fill: #ff7f0e; stroke: #ff7f0e" />
4348
- <use ns4:href="#m9b8c54d372" x="733.50698" y="375.843964" style="fill: #ff7f0e; stroke: #ff7f0e" />
4349
- <use ns4:href="#m9b8c54d372" x="764.468018" y="55.544472" style="fill: #ff7f0e; stroke: #ff7f0e" />
4350
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
4351
  </g>
4352
  </g>
@@ -4405,7 +4405,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4405
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4406
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4407
  </span> |
4408
- Cell: combine | 4.38s
4409
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4410
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4411
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4499,11 +4499,11 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True
4499
  hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True
4500
  hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True
4501
  hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.05 True
4502
- hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.05 True
4503
- hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.05 True
4504
  hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True
4505
- hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.05 True
4506
- hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.05 True
4507
  hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2 0.05 True
4508
  hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4 0.05 True
4509
  hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2 0.05 True
@@ -4514,9 +4514,9 @@ hf_kernels_causal_conv1d cuda_B4_D64_S128_W2 0.05 True
4514
  hf_kernels_causal_conv1d cuda_B4_D64_S128_W4 0.05 True
4515
  hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2 0.05 True
4516
  hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4 0.05 True
4517
- hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.05 True
4518
  hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
4519
- torch_eager cuda_B2_D2048_S128_W2 0.09 True
4520
  torch_eager cuda_B2_D2048_S128_W4 0.08 True
4521
  torch_eager cuda_B2_D2048_S2048_W2 0.15 True
4522
  torch_eager cuda_B2_D2048_S2048_W4 0.16 True
@@ -4524,7 +4524,7 @@ torch_eager cuda_B2_D2048_S512_W2 0.08 True
4524
  torch_eager cuda_B2_D2048_S512_W4 0.08 True
4525
  torch_eager cuda_B2_D64_S128_W2 0.07 True
4526
  torch_eager cuda_B2_D64_S128_W4 0.09 True
4527
- torch_eager cuda_B2_D64_S2048_W2 0.09 True
4528
  torch_eager cuda_B2_D64_S2048_W4 0.08 True
4529
  torch_eager cuda_B2_D64_S512_W2 0.09 True
4530
  torch_eager cuda_B2_D64_S512_W4 0.08 True
@@ -4537,7 +4537,7 @@ torch_eager cuda_B4_D2048_S512_W4 0.10 True
4537
  torch_eager cuda_B4_D64_S128_W2 0.08 True
4538
  torch_eager cuda_B4_D64_S128_W4 0.08 True
4539
  torch_eager cuda_B4_D64_S2048_W2 0.08 True
4540
- torch_eager cuda_B4_D64_S2048_W4 0.08 True
4541
  torch_eager cuda_B4_D64_S512_W2 0.08 True
4542
  torch_eager cuda_B4_D64_S512_W4 0.08 True
4543
 
@@ -4559,7 +4559,7 @@ Implementations included:
4559
  <div class="uv-install-logs" id="uv-logs-combine">
4560
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4561
  <div class="uv-logs-content" style="display: none;">
4562
- Installed 37 packages in 221ms
4563
  </div>
4564
  </div>
4565
  <div class="cell-artifacts">
@@ -4572,7 +4572,7 @@ Installed 37 packages in 221ms
4572
  <rdf:RDF>
4573
  <ns2:Work>
4574
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4575
- <dc:date>2025-10-28T14:09:26.231666</dc:date>
4576
  <dc:format>image/svg+xml</dc:format>
4577
  <dc:creator>
4578
  <ns2:Agent>
@@ -4916,70 +4916,70 @@ Installed 37 packages in 221ms
4916
  <g id="matplotlib.axis_2">
4917
  <g id="ytick_1">
4918
  <g id="grid-y--2" class="grid grid-y">
4919
- <path d="M 47.72 375.159294 L 831.034248 375.159294 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4920
  </g>
4921
  <g id="line2d_25">
4922
  <defs>
4923
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4924
  </defs>
4925
  <g>
4926
- <use ns4:href="#m0fca2865ba" x="47.72" y="375.159294" style="stroke: #000000; stroke-width: 0.8" />
4927
  </g>
4928
  </g>
4929
  <g id="text_25">
4930
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="378.958513" transform="rotate(-0 40.72 378.958513)">0.1</text>
4931
  </g>
4932
  </g>
4933
  <g id="ytick_2">
4934
  <g id="grid-y--3" class="grid grid-y">
4935
- <path d="M 47.72 292.369752 L 831.034248 292.369752 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4936
  </g>
4937
  <g id="line2d_26">
4938
  <g>
4939
- <use ns4:href="#m0fca2865ba" x="47.72" y="292.369752" style="stroke: #000000; stroke-width: 0.8" />
4940
  </g>
4941
  </g>
4942
  <g id="text_26">
4943
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.168971" transform="rotate(-0 40.72 296.168971)">0.2</text>
4944
  </g>
4945
  </g>
4946
  <g id="ytick_3">
4947
  <g id="grid-y--4" class="grid grid-y">
4948
- <path d="M 47.72 209.58021 L 831.034248 209.58021 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4949
  </g>
4950
  <g id="line2d_27">
4951
  <g>
4952
- <use ns4:href="#m0fca2865ba" x="47.72" y="209.58021" style="stroke: #000000; stroke-width: 0.8" />
4953
  </g>
4954
  </g>
4955
  <g id="text_27">
4956
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.379429" transform="rotate(-0 40.72 213.379429)">0.3</text>
4957
  </g>
4958
  </g>
4959
  <g id="ytick_4">
4960
  <g id="grid-y--5" class="grid grid-y">
4961
- <path d="M 47.72 126.790668 L 831.034248 126.790668 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4962
  </g>
4963
  <g id="line2d_28">
4964
  <g>
4965
- <use ns4:href="#m0fca2865ba" x="47.72" y="126.790668" style="stroke: #000000; stroke-width: 0.8" />
4966
  </g>
4967
  </g>
4968
  <g id="text_28">
4969
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.589887" transform="rotate(-0 40.72 130.589887)">0.4</text>
4970
  </g>
4971
  </g>
4972
  <g id="ytick_5">
4973
  <g id="grid-y--6" class="grid grid-y">
4974
- <path d="M 47.72 44.001126 L 831.034248 44.001126 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4975
  </g>
4976
  <g id="line2d_29">
4977
  <g>
4978
- <use ns4:href="#m0fca2865ba" x="47.72" y="44.001126" style="stroke: #000000; stroke-width: 0.8" />
4979
  </g>
4980
  </g>
4981
  <g id="text_29">
4982
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="47.800344" transform="rotate(-0 40.72 47.800344)">0.5</text>
4983
  </g>
4984
  </g>
4985
  <g id="label--y" class="ylabel">
@@ -4987,66 +4987,66 @@ Installed 37 packages in 221ms
4987
  </g>
4988
  </g>
4989
  <g id="series--hf-kernels-causal-conv1d" class="series">
4990
- <path d="M 83.325193 420.186871 L 114.286231 412.917949 L 145.247268 412.868275 L 176.208306 413.042133 L 207.169343 414.110118 L 238.130381 414.110946 L 269.091418 413.580265 L 300.052455 414.938014 L 331.013493 414.656529 L 361.97453 415.161545 L 392.935568 415.575493 L 423.896605 414.035608 L 454.857643 415.195489 L 485.81868 415.20294 L 516.779718 414.706203 L 547.740755 414.043887 L 578.701793 412.479164 L 609.66283 413.795518 L 640.623868 413.141481 L 671.584905 413.489197 L 702.545943 414.151513 L 733.50698 413.886586 L 764.468018 414.582019 L 795.429055 415.368519 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4991
  <defs>
4992
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4993
  </defs>
4994
  <g clip-path="url(#pb49fc4c8d2)">
4995
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
4996
- <use ns4:href="#md7efaf3aec" x="114.286231" y="412.917949" style="fill: #1f77b4; stroke: #1f77b4" />
4997
- <use ns4:href="#md7efaf3aec" x="145.247268" y="412.868275" style="fill: #1f77b4; stroke: #1f77b4" />
4998
- <use ns4:href="#md7efaf3aec" x="176.208306" y="413.042133" style="fill: #1f77b4; stroke: #1f77b4" />
4999
- <use ns4:href="#md7efaf3aec" x="207.169343" y="414.110118" style="fill: #1f77b4; stroke: #1f77b4" />
5000
- <use ns4:href="#md7efaf3aec" x="238.130381" y="414.110946" style="fill: #1f77b4; stroke: #1f77b4" />
5001
- <use ns4:href="#md7efaf3aec" x="269.091418" y="413.580265" style="fill: #1f77b4; stroke: #1f77b4" />
5002
- <use ns4:href="#md7efaf3aec" x="300.052455" y="414.938014" style="fill: #1f77b4; stroke: #1f77b4" />
5003
- <use ns4:href="#md7efaf3aec" x="331.013493" y="414.656529" style="fill: #1f77b4; stroke: #1f77b4" />
5004
- <use ns4:href="#md7efaf3aec" x="361.97453" y="415.161545" style="fill: #1f77b4; stroke: #1f77b4" />
5005
- <use ns4:href="#md7efaf3aec" x="392.935568" y="415.575493" style="fill: #1f77b4; stroke: #1f77b4" />
5006
- <use ns4:href="#md7efaf3aec" x="423.896605" y="414.035608" style="fill: #1f77b4; stroke: #1f77b4" />
5007
- <use ns4:href="#md7efaf3aec" x="454.857643" y="415.195489" style="fill: #1f77b4; stroke: #1f77b4" />
5008
- <use ns4:href="#md7efaf3aec" x="485.81868" y="415.20294" style="fill: #1f77b4; stroke: #1f77b4" />
5009
- <use ns4:href="#md7efaf3aec" x="516.779718" y="414.706203" style="fill: #1f77b4; stroke: #1f77b4" />
5010
- <use ns4:href="#md7efaf3aec" x="547.740755" y="414.043887" style="fill: #1f77b4; stroke: #1f77b4" />
5011
- <use ns4:href="#md7efaf3aec" x="578.701793" y="412.479164" style="fill: #1f77b4; stroke: #1f77b4" />
5012
- <use ns4:href="#md7efaf3aec" x="609.66283" y="413.795518" style="fill: #1f77b4; stroke: #1f77b4" />
5013
- <use ns4:href="#md7efaf3aec" x="640.623868" y="413.141481" style="fill: #1f77b4; stroke: #1f77b4" />
5014
- <use ns4:href="#md7efaf3aec" x="671.584905" y="413.489197" style="fill: #1f77b4; stroke: #1f77b4" />
5015
- <use ns4:href="#md7efaf3aec" x="702.545943" y="414.151513" style="fill: #1f77b4; stroke: #1f77b4" />
5016
- <use ns4:href="#md7efaf3aec" x="733.50698" y="413.886586" style="fill: #1f77b4; stroke: #1f77b4" />
5017
- <use ns4:href="#md7efaf3aec" x="764.468018" y="414.582019" style="fill: #1f77b4; stroke: #1f77b4" />
5018
- <use ns4:href="#md7efaf3aec" x="795.429055" y="415.368519" style="fill: #1f77b4; stroke: #1f77b4" />
5019
  </g>
5020
  </g>
5021
  <g id="series--torch-eager" class="series">
5022
- <path d="M 83.325193 400.963139 L 114.286231 386.754798 L 145.247268 385.803546 L 176.208306 388.30379 L 207.169343 387.50901 L 238.130381 389.106849 L 269.091418 387.49328 L 300.052455 387.715984 L 331.013493 388.179606 L 361.97453 388.212722 L 392.935568 337.799686 L 423.896605 324.362943 L 454.857643 391.39184 L 485.81868 390.497713 L 516.779718 390.58133 L 547.740755 390.373529 L 578.701793 390.174834 L 609.66283 390.323855 L 640.623868 390.472876 L 671.584905 390.688129 L 702.545943 380.7865 L 733.50698 375.843964 L 764.468018 55.544472 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5023
  <defs>
5024
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
5025
  </defs>
5026
  <g clip-path="url(#pb49fc4c8d2)">
5027
- <use ns4:href="#m9b8c54d372" x="83.325193" y="400.963139" style="fill: #ff7f0e; stroke: #ff7f0e" />
5028
- <use ns4:href="#m9b8c54d372" x="114.286231" y="386.754798" style="fill: #ff7f0e; stroke: #ff7f0e" />
5029
- <use ns4:href="#m9b8c54d372" x="145.247268" y="385.803546" style="fill: #ff7f0e; stroke: #ff7f0e" />
5030
- <use ns4:href="#m9b8c54d372" x="176.208306" y="388.30379" style="fill: #ff7f0e; stroke: #ff7f0e" />
5031
- <use ns4:href="#m9b8c54d372" x="207.169343" y="387.50901" style="fill: #ff7f0e; stroke: #ff7f0e" />
5032
- <use ns4:href="#m9b8c54d372" x="238.130381" y="389.106849" style="fill: #ff7f0e; stroke: #ff7f0e" />
5033
- <use ns4:href="#m9b8c54d372" x="269.091418" y="387.49328" style="fill: #ff7f0e; stroke: #ff7f0e" />
5034
- <use ns4:href="#m9b8c54d372" x="300.052455" y="387.715984" style="fill: #ff7f0e; stroke: #ff7f0e" />
5035
- <use ns4:href="#m9b8c54d372" x="331.013493" y="388.179606" style="fill: #ff7f0e; stroke: #ff7f0e" />
5036
- <use ns4:href="#m9b8c54d372" x="361.97453" y="388.212722" style="fill: #ff7f0e; stroke: #ff7f0e" />
5037
- <use ns4:href="#m9b8c54d372" x="392.935568" y="337.799686" style="fill: #ff7f0e; stroke: #ff7f0e" />
5038
- <use ns4:href="#m9b8c54d372" x="423.896605" y="324.362943" style="fill: #ff7f0e; stroke: #ff7f0e" />
5039
- <use ns4:href="#m9b8c54d372" x="454.857643" y="391.39184" style="fill: #ff7f0e; stroke: #ff7f0e" />
5040
- <use ns4:href="#m9b8c54d372" x="485.81868" y="390.497713" style="fill: #ff7f0e; stroke: #ff7f0e" />
5041
- <use ns4:href="#m9b8c54d372" x="516.779718" y="390.58133" style="fill: #ff7f0e; stroke: #ff7f0e" />
5042
- <use ns4:href="#m9b8c54d372" x="547.740755" y="390.373529" style="fill: #ff7f0e; stroke: #ff7f0e" />
5043
- <use ns4:href="#m9b8c54d372" x="578.701793" y="390.174834" style="fill: #ff7f0e; stroke: #ff7f0e" />
5044
- <use ns4:href="#m9b8c54d372" x="609.66283" y="390.323855" style="fill: #ff7f0e; stroke: #ff7f0e" />
5045
- <use ns4:href="#m9b8c54d372" x="640.623868" y="390.472876" style="fill: #ff7f0e; stroke: #ff7f0e" />
5046
- <use ns4:href="#m9b8c54d372" x="671.584905" y="390.688129" style="fill: #ff7f0e; stroke: #ff7f0e" />
5047
- <use ns4:href="#m9b8c54d372" x="702.545943" y="380.7865" style="fill: #ff7f0e; stroke: #ff7f0e" />
5048
- <use ns4:href="#m9b8c54d372" x="733.50698" y="375.843964" style="fill: #ff7f0e; stroke: #ff7f0e" />
5049
- <use ns4:href="#m9b8c54d372" x="764.468018" y="55.544472" style="fill: #ff7f0e; stroke: #ff7f0e" />
5050
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
5051
  </g>
5052
  </g>
 
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
+ <dc:date>2025-10-29T14:27:58.771179</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
 
4216
  <g id="matplotlib.axis_2">
4217
  <g id="ytick_1">
4218
  <g id="grid-y--2" class="grid grid-y">
4219
+ <path d="M 47.72 377.079386 L 831.034248 377.079386 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4220
  </g>
4221
  <g id="line2d_25">
4222
  <defs>
4223
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4224
  </defs>
4225
  <g>
4226
+ <use ns4:href="#m0fca2865ba" x="47.72" y="377.079386" style="stroke: #000000; stroke-width: 0.8" />
4227
  </g>
4228
  </g>
4229
  <g id="text_25">
4230
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="380.878605" transform="rotate(-0 40.72 380.878605)">0.1</text>
4231
  </g>
4232
  </g>
4233
  <g id="ytick_2">
4234
  <g id="grid-y--3" class="grid grid-y">
4235
+ <path d="M 47.72 293.552318 L 831.034248 293.552318 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4236
  </g>
4237
  <g id="line2d_26">
4238
  <g>
4239
+ <use ns4:href="#m0fca2865ba" x="47.72" y="293.552318" style="stroke: #000000; stroke-width: 0.8" />
4240
  </g>
4241
  </g>
4242
  <g id="text_26">
4243
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="297.351537" transform="rotate(-0 40.72 297.351537)">0.2</text>
4244
  </g>
4245
  </g>
4246
  <g id="ytick_3">
4247
  <g id="grid-y--4" class="grid grid-y">
4248
+ <path d="M 47.72 210.02525 L 831.034248 210.02525 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4249
  </g>
4250
  <g id="line2d_27">
4251
  <g>
4252
+ <use ns4:href="#m0fca2865ba" x="47.72" y="210.02525" style="stroke: #000000; stroke-width: 0.8" />
4253
  </g>
4254
  </g>
4255
  <g id="text_27">
4256
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.824469" transform="rotate(-0 40.72 213.824469)">0.3</text>
4257
  </g>
4258
  </g>
4259
  <g id="ytick_4">
4260
  <g id="grid-y--5" class="grid grid-y">
4261
+ <path d="M 47.72 126.498182 L 831.034248 126.498182 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4262
  </g>
4263
  <g id="line2d_28">
4264
  <g>
4265
+ <use ns4:href="#m0fca2865ba" x="47.72" y="126.498182" style="stroke: #000000; stroke-width: 0.8" />
4266
  </g>
4267
  </g>
4268
  <g id="text_28">
4269
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.297401" transform="rotate(-0 40.72 130.297401)">0.4</text>
4270
  </g>
4271
  </g>
4272
  <g id="ytick_5">
4273
  <g id="grid-y--6" class="grid grid-y">
4274
+ <path d="M 47.72 42.971114 L 831.034248 42.971114 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4275
  </g>
4276
  <g id="line2d_29">
4277
  <g>
4278
+ <use ns4:href="#m0fca2865ba" x="47.72" y="42.971114" style="stroke: #000000; stroke-width: 0.8" />
4279
  </g>
4280
  </g>
4281
  <g id="text_29">
4282
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="46.770333" transform="rotate(-0 40.72 46.770333)">0.5</text>
4283
  </g>
4284
  </g>
4285
  <g id="label--y" class="ylabel">
 
4287
  </g>
4288
  </g>
4289
  <g id="series--hf-kernels-causal-conv1d" class="series">
4290
+ <path d="M 83.325193 420.186871 L 114.286231 413.746934 L 145.247268 413.019413 L 176.208306 414.649026 L 207.169343 414.665731 L 238.130381 415.985459 L 269.091418 416.252746 L 300.052455 415.525225 L 331.013493 416.703792 L 361.97453 415.692279 L 392.935568 416.269451 L 423.896605 416.168383 L 454.857643 415.52606 L 485.81868 415.952048 L 516.779718 414.072689 L 547.740755 415.399934 L 578.701793 415.43418 L 609.66283 416.402259 L 640.623868 414.841138 L 671.584905 415.024898 L 702.545943 414.990652 L 733.50698 414.974782 L 764.468018 415.61794 L 795.429055 415.759936 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4291
  <defs>
4292
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4293
  </defs>
4294
  <g clip-path="url(#pb49fc4c8d2)">
4295
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
4296
+ <use ns4:href="#md7efaf3aec" x="114.286231" y="413.746934" style="fill: #1f77b4; stroke: #1f77b4" />
4297
+ <use ns4:href="#md7efaf3aec" x="145.247268" y="413.019413" style="fill: #1f77b4; stroke: #1f77b4" />
4298
+ <use ns4:href="#md7efaf3aec" x="176.208306" y="414.649026" style="fill: #1f77b4; stroke: #1f77b4" />
4299
+ <use ns4:href="#md7efaf3aec" x="207.169343" y="414.665731" style="fill: #1f77b4; stroke: #1f77b4" />
4300
+ <use ns4:href="#md7efaf3aec" x="238.130381" y="415.985459" style="fill: #1f77b4; stroke: #1f77b4" />
4301
+ <use ns4:href="#md7efaf3aec" x="269.091418" y="416.252746" style="fill: #1f77b4; stroke: #1f77b4" />
4302
+ <use ns4:href="#md7efaf3aec" x="300.052455" y="415.525225" style="fill: #1f77b4; stroke: #1f77b4" />
4303
+ <use ns4:href="#md7efaf3aec" x="331.013493" y="416.703792" style="fill: #1f77b4; stroke: #1f77b4" />
4304
+ <use ns4:href="#md7efaf3aec" x="361.97453" y="415.692279" style="fill: #1f77b4; stroke: #1f77b4" />
4305
+ <use ns4:href="#md7efaf3aec" x="392.935568" y="416.269451" style="fill: #1f77b4; stroke: #1f77b4" />
4306
+ <use ns4:href="#md7efaf3aec" x="423.896605" y="416.168383" style="fill: #1f77b4; stroke: #1f77b4" />
4307
+ <use ns4:href="#md7efaf3aec" x="454.857643" y="415.52606" style="fill: #1f77b4; stroke: #1f77b4" />
4308
+ <use ns4:href="#md7efaf3aec" x="485.81868" y="415.952048" style="fill: #1f77b4; stroke: #1f77b4" />
4309
+ <use ns4:href="#md7efaf3aec" x="516.779718" y="414.072689" style="fill: #1f77b4; stroke: #1f77b4" />
4310
+ <use ns4:href="#md7efaf3aec" x="547.740755" y="415.399934" style="fill: #1f77b4; stroke: #1f77b4" />
4311
+ <use ns4:href="#md7efaf3aec" x="578.701793" y="415.43418" style="fill: #1f77b4; stroke: #1f77b4" />
4312
+ <use ns4:href="#md7efaf3aec" x="609.66283" y="416.402259" style="fill: #1f77b4; stroke: #1f77b4" />
4313
+ <use ns4:href="#md7efaf3aec" x="640.623868" y="414.841138" style="fill: #1f77b4; stroke: #1f77b4" />
4314
+ <use ns4:href="#md7efaf3aec" x="671.584905" y="415.024898" style="fill: #1f77b4; stroke: #1f77b4" />
4315
+ <use ns4:href="#md7efaf3aec" x="702.545943" y="414.990652" style="fill: #1f77b4; stroke: #1f77b4" />
4316
+ <use ns4:href="#md7efaf3aec" x="733.50698" y="414.974782" style="fill: #1f77b4; stroke: #1f77b4" />
4317
+ <use ns4:href="#md7efaf3aec" x="764.468018" y="415.61794" style="fill: #1f77b4; stroke: #1f77b4" />
4318
+ <use ns4:href="#md7efaf3aec" x="795.429055" y="415.759936" style="fill: #1f77b4; stroke: #1f77b4" />
4319
  </g>
4320
  </g>
4321
  <g id="series--torch-eager" class="series">
4322
+ <path d="M 83.325193 401.710683 L 114.286231 389.180788 L 145.247268 389.523249 L 176.208306 390.141349 L 207.169343 391.126968 L 238.130381 390.809566 L 269.091418 390.934856 L 300.052455 390.667569 L 331.013493 390.500515 L 361.97453 389.707008 L 392.935568 339.037818 L 423.896605 325.239147 L 454.857643 391.043441 L 485.81868 391.009195 L 516.779718 391.143674 L 547.740755 390.442046 L 578.701793 390.951562 L 609.66283 389.129836 L 640.623868 391.795185 L 671.584905 391.319081 L 702.545943 381.654999 L 733.50698 375.966806 L 764.468018 53.96077 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4323
  <defs>
4324
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4325
  </defs>
4326
  <g clip-path="url(#pb49fc4c8d2)">
4327
+ <use ns4:href="#m9b8c54d372" x="83.325193" y="401.710683" style="fill: #ff7f0e; stroke: #ff7f0e" />
4328
+ <use ns4:href="#m9b8c54d372" x="114.286231" y="389.180788" style="fill: #ff7f0e; stroke: #ff7f0e" />
4329
+ <use ns4:href="#m9b8c54d372" x="145.247268" y="389.523249" style="fill: #ff7f0e; stroke: #ff7f0e" />
4330
+ <use ns4:href="#m9b8c54d372" x="176.208306" y="390.141349" style="fill: #ff7f0e; stroke: #ff7f0e" />
4331
+ <use ns4:href="#m9b8c54d372" x="207.169343" y="391.126968" style="fill: #ff7f0e; stroke: #ff7f0e" />
4332
+ <use ns4:href="#m9b8c54d372" x="238.130381" y="390.809566" style="fill: #ff7f0e; stroke: #ff7f0e" />
4333
+ <use ns4:href="#m9b8c54d372" x="269.091418" y="390.934856" style="fill: #ff7f0e; stroke: #ff7f0e" />
4334
+ <use ns4:href="#m9b8c54d372" x="300.052455" y="390.667569" style="fill: #ff7f0e; stroke: #ff7f0e" />
4335
+ <use ns4:href="#m9b8c54d372" x="331.013493" y="390.500515" style="fill: #ff7f0e; stroke: #ff7f0e" />
4336
+ <use ns4:href="#m9b8c54d372" x="361.97453" y="389.707008" style="fill: #ff7f0e; stroke: #ff7f0e" />
4337
+ <use ns4:href="#m9b8c54d372" x="392.935568" y="339.037818" style="fill: #ff7f0e; stroke: #ff7f0e" />
4338
+ <use ns4:href="#m9b8c54d372" x="423.896605" y="325.239147" style="fill: #ff7f0e; stroke: #ff7f0e" />
4339
+ <use ns4:href="#m9b8c54d372" x="454.857643" y="391.043441" style="fill: #ff7f0e; stroke: #ff7f0e" />
4340
+ <use ns4:href="#m9b8c54d372" x="485.81868" y="391.009195" style="fill: #ff7f0e; stroke: #ff7f0e" />
4341
+ <use ns4:href="#m9b8c54d372" x="516.779718" y="391.143674" style="fill: #ff7f0e; stroke: #ff7f0e" />
4342
+ <use ns4:href="#m9b8c54d372" x="547.740755" y="390.442046" style="fill: #ff7f0e; stroke: #ff7f0e" />
4343
+ <use ns4:href="#m9b8c54d372" x="578.701793" y="390.951562" style="fill: #ff7f0e; stroke: #ff7f0e" />
4344
+ <use ns4:href="#m9b8c54d372" x="609.66283" y="389.129836" style="fill: #ff7f0e; stroke: #ff7f0e" />
4345
+ <use ns4:href="#m9b8c54d372" x="640.623868" y="391.795185" style="fill: #ff7f0e; stroke: #ff7f0e" />
4346
+ <use ns4:href="#m9b8c54d372" x="671.584905" y="391.319081" style="fill: #ff7f0e; stroke: #ff7f0e" />
4347
+ <use ns4:href="#m9b8c54d372" x="702.545943" y="381.654999" style="fill: #ff7f0e; stroke: #ff7f0e" />
4348
+ <use ns4:href="#m9b8c54d372" x="733.50698" y="375.966806" style="fill: #ff7f0e; stroke: #ff7f0e" />
4349
+ <use ns4:href="#m9b8c54d372" x="764.468018" y="53.96077" style="fill: #ff7f0e; stroke: #ff7f0e" />
4350
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
4351
  </g>
4352
  </g>
 
4405
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4406
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4407
  </span> |
4408
+ Cell: combine | 4.32s
4409
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4410
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4411
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4499
  hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True
4500
  hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True
4501
  hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.05 True
4502
+ hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.06 True
4503
+ hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.06 True
4504
  hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True
4505
+ hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.06 True
4506
+ hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.06 True
4507
  hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2 0.05 True
4508
  hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4 0.05 True
4509
  hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2 0.05 True
 
4514
  hf_kernels_causal_conv1d cuda_B4_D64_S128_W4 0.05 True
4515
  hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2 0.05 True
4516
  hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4 0.05 True
4517
+ hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.06 True
4518
  hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
4519
+ torch_eager cuda_B2_D2048_S128_W2 0.08 True
4520
  torch_eager cuda_B2_D2048_S128_W4 0.08 True
4521
  torch_eager cuda_B2_D2048_S2048_W2 0.15 True
4522
  torch_eager cuda_B2_D2048_S2048_W4 0.16 True
 
4524
  torch_eager cuda_B2_D2048_S512_W4 0.08 True
4525
  torch_eager cuda_B2_D64_S128_W2 0.07 True
4526
  torch_eager cuda_B2_D64_S128_W4 0.09 True
4527
+ torch_eager cuda_B2_D64_S2048_W2 0.08 True
4528
  torch_eager cuda_B2_D64_S2048_W4 0.08 True
4529
  torch_eager cuda_B2_D64_S512_W2 0.09 True
4530
  torch_eager cuda_B2_D64_S512_W4 0.08 True
 
4537
  torch_eager cuda_B4_D64_S128_W2 0.08 True
4538
  torch_eager cuda_B4_D64_S128_W4 0.08 True
4539
  torch_eager cuda_B4_D64_S2048_W2 0.08 True
4540
+ torch_eager cuda_B4_D64_S2048_W4 0.09 True
4541
  torch_eager cuda_B4_D64_S512_W2 0.08 True
4542
  torch_eager cuda_B4_D64_S512_W4 0.08 True
4543
 
 
4559
  <div class="uv-install-logs" id="uv-logs-combine">
4560
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4561
  <div class="uv-logs-content" style="display: none;">
4562
+ Installed 37 packages in 214ms
4563
  </div>
4564
  </div>
4565
  <div class="cell-artifacts">
 
4572
  <rdf:RDF>
4573
  <ns2:Work>
4574
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4575
+ <dc:date>2025-10-29T14:27:58.771179</dc:date>
4576
  <dc:format>image/svg+xml</dc:format>
4577
  <dc:creator>
4578
  <ns2:Agent>
 
4916
  <g id="matplotlib.axis_2">
4917
  <g id="ytick_1">
4918
  <g id="grid-y--2" class="grid grid-y">
4919
+ <path d="M 47.72 377.079386 L 831.034248 377.079386 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4920
  </g>
4921
  <g id="line2d_25">
4922
  <defs>
4923
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4924
  </defs>
4925
  <g>
4926
+ <use ns4:href="#m0fca2865ba" x="47.72" y="377.079386" style="stroke: #000000; stroke-width: 0.8" />
4927
  </g>
4928
  </g>
4929
  <g id="text_25">
4930
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="380.878605" transform="rotate(-0 40.72 380.878605)">0.1</text>
4931
  </g>
4932
  </g>
4933
  <g id="ytick_2">
4934
  <g id="grid-y--3" class="grid grid-y">
4935
+ <path d="M 47.72 293.552318 L 831.034248 293.552318 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4936
  </g>
4937
  <g id="line2d_26">
4938
  <g>
4939
+ <use ns4:href="#m0fca2865ba" x="47.72" y="293.552318" style="stroke: #000000; stroke-width: 0.8" />
4940
  </g>
4941
  </g>
4942
  <g id="text_26">
4943
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="297.351537" transform="rotate(-0 40.72 297.351537)">0.2</text>
4944
  </g>
4945
  </g>
4946
  <g id="ytick_3">
4947
  <g id="grid-y--4" class="grid grid-y">
4948
+ <path d="M 47.72 210.02525 L 831.034248 210.02525 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4949
  </g>
4950
  <g id="line2d_27">
4951
  <g>
4952
+ <use ns4:href="#m0fca2865ba" x="47.72" y="210.02525" style="stroke: #000000; stroke-width: 0.8" />
4953
  </g>
4954
  </g>
4955
  <g id="text_27">
4956
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.824469" transform="rotate(-0 40.72 213.824469)">0.3</text>
4957
  </g>
4958
  </g>
4959
  <g id="ytick_4">
4960
  <g id="grid-y--5" class="grid grid-y">
4961
+ <path d="M 47.72 126.498182 L 831.034248 126.498182 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4962
  </g>
4963
  <g id="line2d_28">
4964
  <g>
4965
+ <use ns4:href="#m0fca2865ba" x="47.72" y="126.498182" style="stroke: #000000; stroke-width: 0.8" />
4966
  </g>
4967
  </g>
4968
  <g id="text_28">
4969
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.297401" transform="rotate(-0 40.72 130.297401)">0.4</text>
4970
  </g>
4971
  </g>
4972
  <g id="ytick_5">
4973
  <g id="grid-y--6" class="grid grid-y">
4974
+ <path d="M 47.72 42.971114 L 831.034248 42.971114 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4975
  </g>
4976
  <g id="line2d_29">
4977
  <g>
4978
+ <use ns4:href="#m0fca2865ba" x="47.72" y="42.971114" style="stroke: #000000; stroke-width: 0.8" />
4979
  </g>
4980
  </g>
4981
  <g id="text_29">
4982
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="46.770333" transform="rotate(-0 40.72 46.770333)">0.5</text>
4983
  </g>
4984
  </g>
4985
  <g id="label--y" class="ylabel">
 
4987
  </g>
4988
  </g>
4989
  <g id="series--hf-kernels-causal-conv1d" class="series">
4990
+ <path d="M 83.325193 420.186871 L 114.286231 413.746934 L 145.247268 413.019413 L 176.208306 414.649026 L 207.169343 414.665731 L 238.130381 415.985459 L 269.091418 416.252746 L 300.052455 415.525225 L 331.013493 416.703792 L 361.97453 415.692279 L 392.935568 416.269451 L 423.896605 416.168383 L 454.857643 415.52606 L 485.81868 415.952048 L 516.779718 414.072689 L 547.740755 415.399934 L 578.701793 415.43418 L 609.66283 416.402259 L 640.623868 414.841138 L 671.584905 415.024898 L 702.545943 414.990652 L 733.50698 414.974782 L 764.468018 415.61794 L 795.429055 415.759936 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4991
  <defs>
4992
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4993
  </defs>
4994
  <g clip-path="url(#pb49fc4c8d2)">
4995
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
4996
+ <use ns4:href="#md7efaf3aec" x="114.286231" y="413.746934" style="fill: #1f77b4; stroke: #1f77b4" />
4997
+ <use ns4:href="#md7efaf3aec" x="145.247268" y="413.019413" style="fill: #1f77b4; stroke: #1f77b4" />
4998
+ <use ns4:href="#md7efaf3aec" x="176.208306" y="414.649026" style="fill: #1f77b4; stroke: #1f77b4" />
4999
+ <use ns4:href="#md7efaf3aec" x="207.169343" y="414.665731" style="fill: #1f77b4; stroke: #1f77b4" />
5000
+ <use ns4:href="#md7efaf3aec" x="238.130381" y="415.985459" style="fill: #1f77b4; stroke: #1f77b4" />
5001
+ <use ns4:href="#md7efaf3aec" x="269.091418" y="416.252746" style="fill: #1f77b4; stroke: #1f77b4" />
5002
+ <use ns4:href="#md7efaf3aec" x="300.052455" y="415.525225" style="fill: #1f77b4; stroke: #1f77b4" />
5003
+ <use ns4:href="#md7efaf3aec" x="331.013493" y="416.703792" style="fill: #1f77b4; stroke: #1f77b4" />
5004
+ <use ns4:href="#md7efaf3aec" x="361.97453" y="415.692279" style="fill: #1f77b4; stroke: #1f77b4" />
5005
+ <use ns4:href="#md7efaf3aec" x="392.935568" y="416.269451" style="fill: #1f77b4; stroke: #1f77b4" />
5006
+ <use ns4:href="#md7efaf3aec" x="423.896605" y="416.168383" style="fill: #1f77b4; stroke: #1f77b4" />
5007
+ <use ns4:href="#md7efaf3aec" x="454.857643" y="415.52606" style="fill: #1f77b4; stroke: #1f77b4" />
5008
+ <use ns4:href="#md7efaf3aec" x="485.81868" y="415.952048" style="fill: #1f77b4; stroke: #1f77b4" />
5009
+ <use ns4:href="#md7efaf3aec" x="516.779718" y="414.072689" style="fill: #1f77b4; stroke: #1f77b4" />
5010
+ <use ns4:href="#md7efaf3aec" x="547.740755" y="415.399934" style="fill: #1f77b4; stroke: #1f77b4" />
5011
+ <use ns4:href="#md7efaf3aec" x="578.701793" y="415.43418" style="fill: #1f77b4; stroke: #1f77b4" />
5012
+ <use ns4:href="#md7efaf3aec" x="609.66283" y="416.402259" style="fill: #1f77b4; stroke: #1f77b4" />
5013
+ <use ns4:href="#md7efaf3aec" x="640.623868" y="414.841138" style="fill: #1f77b4; stroke: #1f77b4" />
5014
+ <use ns4:href="#md7efaf3aec" x="671.584905" y="415.024898" style="fill: #1f77b4; stroke: #1f77b4" />
5015
+ <use ns4:href="#md7efaf3aec" x="702.545943" y="414.990652" style="fill: #1f77b4; stroke: #1f77b4" />
5016
+ <use ns4:href="#md7efaf3aec" x="733.50698" y="414.974782" style="fill: #1f77b4; stroke: #1f77b4" />
5017
+ <use ns4:href="#md7efaf3aec" x="764.468018" y="415.61794" style="fill: #1f77b4; stroke: #1f77b4" />
5018
+ <use ns4:href="#md7efaf3aec" x="795.429055" y="415.759936" style="fill: #1f77b4; stroke: #1f77b4" />
5019
  </g>
5020
  </g>
5021
  <g id="series--torch-eager" class="series">
5022
+ <path d="M 83.325193 401.710683 L 114.286231 389.180788 L 145.247268 389.523249 L 176.208306 390.141349 L 207.169343 391.126968 L 238.130381 390.809566 L 269.091418 390.934856 L 300.052455 390.667569 L 331.013493 390.500515 L 361.97453 389.707008 L 392.935568 339.037818 L 423.896605 325.239147 L 454.857643 391.043441 L 485.81868 391.009195 L 516.779718 391.143674 L 547.740755 390.442046 L 578.701793 390.951562 L 609.66283 389.129836 L 640.623868 391.795185 L 671.584905 391.319081 L 702.545943 381.654999 L 733.50698 375.966806 L 764.468018 53.96077 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5023
  <defs>
5024
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
5025
  </defs>
5026
  <g clip-path="url(#pb49fc4c8d2)">
5027
+ <use ns4:href="#m9b8c54d372" x="83.325193" y="401.710683" style="fill: #ff7f0e; stroke: #ff7f0e" />
5028
+ <use ns4:href="#m9b8c54d372" x="114.286231" y="389.180788" style="fill: #ff7f0e; stroke: #ff7f0e" />
5029
+ <use ns4:href="#m9b8c54d372" x="145.247268" y="389.523249" style="fill: #ff7f0e; stroke: #ff7f0e" />
5030
+ <use ns4:href="#m9b8c54d372" x="176.208306" y="390.141349" style="fill: #ff7f0e; stroke: #ff7f0e" />
5031
+ <use ns4:href="#m9b8c54d372" x="207.169343" y="391.126968" style="fill: #ff7f0e; stroke: #ff7f0e" />
5032
+ <use ns4:href="#m9b8c54d372" x="238.130381" y="390.809566" style="fill: #ff7f0e; stroke: #ff7f0e" />
5033
+ <use ns4:href="#m9b8c54d372" x="269.091418" y="390.934856" style="fill: #ff7f0e; stroke: #ff7f0e" />
5034
+ <use ns4:href="#m9b8c54d372" x="300.052455" y="390.667569" style="fill: #ff7f0e; stroke: #ff7f0e" />
5035
+ <use ns4:href="#m9b8c54d372" x="331.013493" y="390.500515" style="fill: #ff7f0e; stroke: #ff7f0e" />
5036
+ <use ns4:href="#m9b8c54d372" x="361.97453" y="389.707008" style="fill: #ff7f0e; stroke: #ff7f0e" />
5037
+ <use ns4:href="#m9b8c54d372" x="392.935568" y="339.037818" style="fill: #ff7f0e; stroke: #ff7f0e" />
5038
+ <use ns4:href="#m9b8c54d372" x="423.896605" y="325.239147" style="fill: #ff7f0e; stroke: #ff7f0e" />
5039
+ <use ns4:href="#m9b8c54d372" x="454.857643" y="391.043441" style="fill: #ff7f0e; stroke: #ff7f0e" />
5040
+ <use ns4:href="#m9b8c54d372" x="485.81868" y="391.009195" style="fill: #ff7f0e; stroke: #ff7f0e" />
5041
+ <use ns4:href="#m9b8c54d372" x="516.779718" y="391.143674" style="fill: #ff7f0e; stroke: #ff7f0e" />
5042
+ <use ns4:href="#m9b8c54d372" x="547.740755" y="390.442046" style="fill: #ff7f0e; stroke: #ff7f0e" />
5043
+ <use ns4:href="#m9b8c54d372" x="578.701793" y="390.951562" style="fill: #ff7f0e; stroke: #ff7f0e" />
5044
+ <use ns4:href="#m9b8c54d372" x="609.66283" y="389.129836" style="fill: #ff7f0e; stroke: #ff7f0e" />
5045
+ <use ns4:href="#m9b8c54d372" x="640.623868" y="391.795185" style="fill: #ff7f0e; stroke: #ff7f0e" />
5046
+ <use ns4:href="#m9b8c54d372" x="671.584905" y="391.319081" style="fill: #ff7f0e; stroke: #ff7f0e" />
5047
+ <use ns4:href="#m9b8c54d372" x="702.545943" y="381.654999" style="fill: #ff7f0e; stroke: #ff7f0e" />
5048
+ <use ns4:href="#m9b8c54d372" x="733.50698" y="375.966806" style="fill: #ff7f0e; stroke: #ff7f0e" />
5049
+ <use ns4:href="#m9b8c54d372" x="764.468018" y="53.96077" style="fill: #ff7f0e; stroke: #ff7f0e" />
5050
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
5051
  </g>
5052
  </g>
flash_attn/impls/artifacts/benchmark/attention.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9106109999947876, "p50": 0.9171110000352201, "p90": 0.9204320000435473, "mean": 0.9179216000347878, "iqr": 0.005419999979494605, "raw_times": [0.9171110000352201, 0.9150120000640527, 0.9106109999947876, 0.9204320000435473, 0.9264420000363316], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9176309999929799, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9576329999845257, "p50": 0.960063999968952, "p90": 0.9623629999850891, "mean": 0.9611931999643275, "iqr": 0.0033900000744324643, "raw_times": [0.9589729999106567, 0.9576329999845257, 0.960063999968952, 0.9669329999724141, 0.9623629999850891], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9673530000782193, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0019650000003821, "p50": 1.0193159999971613, "p90": 1.0211459999709405, "mean": 1.015251600006195, "iqr": 0.01198099994326185, "raw_times": [1.0019650000003821, 1.0091650000276786, 1.024666000034813, 1.0193159999971613, 1.0211459999709405], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.009233999980097, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0176959999625979, "p50": 1.0199849999708022, "p90": 1.025695000066662, "mean": 1.0218714000075124, "iqr": 0.006820000066909415, "raw_times": [1.0271060000377474, 1.0176959999625979, 1.0188749999997526, 1.0199849999708022, 1.025695000066662], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.027405000058934, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1665810000067722, "p50": 1.1845809999613266, "p90": 1.185440999961429, "mean": 1.1787729999923613, "iqr": 0.01419000000169035, "raw_times": [1.1712509999597387, 1.1665810000067722, 1.18601100007254, 1.1845809999613266, 1.185440999961429], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1787800000320203, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1722899999995207, "p50": 1.1832310000272628, "p90": 1.1854509999693619, "mean": 1.181276799979969, "iqr": 0.008630000024822948, "raw_times": [1.1885909999591604, 1.1854509999693619, 1.176820999944539, 1.1832310000272628, 1.1722899999995207], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1782799999764393, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-29T14:27:40Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.98791400000664, "p50": 0.995113999977093, "p90": 1.0003840000081254, "mean": 0.9967803999984426, "iqr": 0.00634899998885885, "raw_times": [0.98791400000664, 0.9940350000192666, 1.006454999981088, 1.0003840000081254, 0.995113999977093], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0150049999992916, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0233649999804584, "p50": 1.0321449999537435, "p90": 1.0348449999924014, "mean": 1.032277399974646, "iqr": 0.009739000006447895, "raw_times": [1.045925999960673, 1.0251059999859535, 1.0321449999537435, 1.0233649999804584, 1.0348449999924014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.035865999995167, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0737370000128976, "p50": 1.084086999981082, "p90": 1.088675999994848, "mean": 1.0826705999988917, "iqr": 0.0148400000057336, "raw_times": [1.0738359999891145, 1.0930170000165162, 1.084086999981082, 1.0737370000128976, 1.088675999994848], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0889670000437945, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0887770000067576, "p50": 1.0916359999555425, "p90": 1.096396999969329, "mean": 1.0932085999797891, "iqr": 0.005600999998023326, "raw_times": [1.0887770000067576, 1.0907959999713057, 1.0916359999555425, 1.0984369999960109, 1.096396999969329], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1031370000296192, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2465009999687027, "p50": 1.2523310000460697, "p90": 1.2523909999799798, "mean": 1.2538410000047406, "iqr": 0.005290999979479238, "raw_times": [1.2523310000460697, 1.2523909999799798, 1.2471000000005006, 1.2465009999687027, 1.2708820000284504], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2551809999763464, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2355700000057368, "p50": 1.241141000036805, "p90": 1.2576200000466997, "mean": 1.2477664000130062, "iqr": 0.02047000003813082, "raw_times": [1.2355700000057368, 1.241141000036805, 1.237150000008569, 1.2576200000466997, 1.2673509999672206], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2579809999806457, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/cells/benchmark.py CHANGED
@@ -4,7 +4,7 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
- # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
@@ -13,19 +13,18 @@
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
- from kernels import get_kernel
17
 
18
- # Load the flash attention 3 kernel
19
- hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
20
 
21
-
22
- def hf_flash_attention3(query, key, value):
23
- return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
 
24
 
25
 
26
  run_benchmark(
27
  kernel_type=KernelTypeEnum.ATTENTION,
28
- impl_name="hf_kernels_flash_attn3",
29
- impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
30
- impl_func=hf_flash_attention3,
31
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
+ # "xformers",
8
  # ]
9
  #
10
  # [tool.uv.sources]
 
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
+ import xformers.ops as xops
17
 
 
 
18
 
19
+ def xformers_attention(q, k, v):
20
+ """xFormers memory efficient attention"""
21
+ # xFormers expects [batch, seq_len, heads, head_dim]
22
+ return xops.memory_efficient_attention(q, k, v)
23
 
24
 
25
  run_benchmark(
26
  kernel_type=KernelTypeEnum.ATTENTION,
27
+ impl_name="xformers_meff",
28
+ impl_tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
29
+ impl_func=xformers_attention,
30
  )
flash_attn/impls/flash_attention.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: nv | 0.26s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3888,7 +3888,7 @@ Cell: nv | 0.26s
3888
  </div>
3889
  </div>
3890
  <div id="output-nv" class="cell-output">
3891
- <div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:08:39 2025
3892
  +-----------------------------------------------------------------------------------------+
3893
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3894
  |-----------------------------------------+------------------------+----------------------+
@@ -3897,7 +3897,7 @@ Cell: nv | 0.26s
3897
  | | | MIG M. |
3898
  |=========================================+========================+======================|
3899
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3900
- | N/A 32C P0 153W / 350W | 0MiB / 46068MiB | 26% Default |
3901
  | | | N/A |
3902
  +-----------------------------------------+------------------------+----------------------+
3903
 
@@ -3919,9 +3919,9 @@ Cell: nv | 0.26s
3919
  <span class="collapse-indicators">
3920
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3921
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3922
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3923
  </span> |
3924
- Cell: benchmark | 3.83s
3925
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3926
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3927
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3972,29 +3972,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.585ms 101.47% 3.585ms 3.585ms 1
3976
- torch_flash_ma 6.34% 327.656us 45.53% 2.352ms 2.352ms 0.000us 0.00% 3.573ms 3.573ms 1
3977
- aten::scaled_dot_product_attention 0.82% 42.312us 4.12% 213.057us 71.019us 0.000us 0.00% 2.820ms 940.062us 3
3978
- aten::_scaled_dot_product_flash_attention 0.51% 26.321us 3.31% 170.745us 56.915us 0.000us 0.00% 2.820ms 940.062us 3
3979
- aten::_flash_attention_forward 0.73% 37.527us 2.40% 124.015us 41.338us 2.820ms 79.83% 2.820ms 940.062us 3
3980
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.820ms 79.83% 2.820ms 940.062us 3
3981
- aten::contiguous 0.27% 14.121us 33.79% 1.745ms 145.446us 0.000us 0.00% 752.928us 62.744us 12
3982
- aten::clone 0.72% 37.329us 33.52% 1.731ms 144.269us 0.000us 0.00% 752.928us 62.744us 12
3983
- aten::copy_ 1.68% 87.013us 31.25% 1.614ms 134.513us 712.672us 20.17% 752.928us 62.744us 12
3984
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 712.672us 20.17% 712.672us 59.389us 12
3985
- Activity Buffer Request 27.64% 1.428ms 27.64% 1.428ms 1.428ms 40.256us 1.14% 40.256us 40.256us 1
3986
- aten::transpose 1.24% 64.087us 1.67% 86.009us 3.584us 0.000us 0.00% 0.000us 0.000us 24
3987
- aten::as_strided 0.42% 21.922us 0.42% 21.922us 0.913us 0.000us 0.00% 0.000us 0.000us 24
3988
- aten::empty_like 0.48% 24.711us 1.99% 102.775us 6.852us 0.000us 0.00% 0.000us 0.000us 15
3989
- aten::empty 1.74% 89.843us 1.74% 89.843us 3.743us 0.000us 0.00% 0.000us 0.000us 24
3990
- cudaLaunchKernel 2.38% 122.771us 2.38% 122.771us 8.185us 0.000us 0.00% 0.000us 0.000us 15
3991
- aten::empty_strided 0.34% 17.310us 0.34% 17.310us 5.770us 0.000us 0.00% 0.000us 0.000us 3
3992
- cudaDeviceGetAttribute 0.04% 2.229us 0.04% 2.229us 0.372us 0.000us 0.00% 0.000us 0.000us 6
3993
- cudaFuncSetAttribute 0.17% 8.900us 0.17% 8.900us 2.967us 0.000us 0.00% 0.000us 0.000us 3
3994
- cudaDeviceSynchronize 54.47% 2.814ms 54.47% 2.814ms 2.814ms 0.000us 0.00% 0.000us 0.000us 1
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
- Self CPU time total: 5.165ms
3997
- Self CUDA time total: 3.533ms
3998
 
3999
 
4000
 
@@ -4004,29 +4004,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
4004
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
- torch_flash_ma 4.84% 255.079us 41.49% 2.188ms 2.188ms 0.000us 0.00% 3.787ms 3.787ms 1
4008
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.743ms 100.29% 3.743ms 3.743ms 1
4009
- aten::scaled_dot_product_attention 0.47% 24.640us 3.42% 180.356us 60.119us 0.000us 0.00% 2.967ms 989.106us 3
4010
- aten::_scaled_dot_product_flash_attention 0.36% 19.241us 2.95% 155.716us 51.905us 0.000us 0.00% 2.967ms 989.106us 3
4011
- aten::_flash_attention_forward 0.73% 38.683us 2.19% 115.525us 38.508us 2.967ms 79.51% 2.967ms 989.106us 3
4012
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.967ms 79.51% 2.967ms 989.106us 3
4013
- aten::contiguous 0.17% 8.802us 32.41% 1.709ms 142.425us 0.000us 0.00% 819.868us 68.322us 12
4014
- aten::clone 0.52% 27.349us 32.24% 1.700ms 141.692us 0.000us 0.00% 819.868us 68.322us 12
4015
- aten::copy_ 1.56% 82.061us 30.60% 1.614ms 134.473us 764.892us 20.49% 819.868us 68.322us 12
4016
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 764.892us 20.49% 764.892us 63.741us 12
4017
- Activity Buffer Request 27.50% 1.450ms 27.50% 1.450ms 1.450ms 54.976us 1.47% 54.976us 54.976us 1
4018
- aten::transpose 0.91% 47.959us 1.22% 64.512us 2.688us 0.000us 0.00% 0.000us 0.000us 24
4019
- aten::as_strided 0.31% 16.553us 0.31% 16.553us 0.690us 0.000us 0.00% 0.000us 0.000us 24
4020
- aten::empty_like 0.39% 20.732us 1.52% 80.304us 5.354us 0.000us 0.00% 0.000us 0.000us 15
4021
- aten::empty 1.38% 72.972us 1.38% 72.972us 3.040us 0.000us 0.00% 0.000us 0.000us 24
4022
- cudaLaunchKernel 1.96% 103.146us 1.96% 103.146us 6.876us 0.000us 0.00% 0.000us 0.000us 15
4023
- aten::empty_strided 0.28% 14.880us 0.28% 14.880us 4.960us 0.000us 0.00% 0.000us 0.000us 3
4024
- cudaDeviceGetAttribute 0.03% 1.800us 0.03% 1.800us 0.300us 0.000us 0.00% 0.000us 0.000us 6
4025
- cudaFuncSetAttribute 0.07% 3.830us 0.07% 3.830us 1.277us 0.000us 0.00% 0.000us 0.000us 3
4026
- cudaDeviceSynchronize 58.51% 3.085ms 58.51% 3.085ms 3.085ms 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
- Self CPU time total: 5.273ms
4029
- Self CUDA time total: 3.732ms
4030
 
4031
 
4032
 
@@ -4036,29 +4036,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
- torch_flash_ma 4.77% 251.162us 41.45% 2.184ms 2.184ms 0.000us 0.00% 3.786ms 3.786ms 1
4040
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.738ms 100.28% 3.738ms 3.738ms 1
4041
- aten::scaled_dot_product_attention 0.46% 24.280us 3.42% 180.086us 60.029us 0.000us 0.00% 2.949ms 982.872us 3
4042
- aten::_scaled_dot_product_flash_attention 0.34% 18.160us 2.96% 155.806us 51.935us 0.000us 0.00% 2.949ms 982.872us 3
4043
- aten::_flash_attention_forward 0.73% 38.599us 2.20% 115.865us 38.622us 2.949ms 79.09% 2.949ms 982.872us 3
4044
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.949ms 79.09% 2.949ms 982.872us 3
4045
- aten::contiguous 0.17% 8.991us 32.44% 1.710ms 142.465us 0.000us 0.00% 837.719us 69.810us 12
4046
- aten::clone 0.53% 27.728us 32.27% 1.701ms 141.715us 0.000us 0.00% 837.719us 69.810us 12
4047
- aten::copy_ 1.52% 79.873us 30.57% 1.611ms 134.242us 779.480us 20.91% 837.719us 69.810us 12
4048
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 779.480us 20.91% 779.480us 64.957us 12
4049
- Activity Buffer Request 27.50% 1.449ms 27.50% 1.449ms 1.449ms 58.239us 1.56% 58.239us 58.239us 1
4050
- aten::transpose 0.92% 48.219us 1.24% 65.252us 2.719us 0.000us 0.00% 0.000us 0.000us 24
4051
- aten::as_strided 0.32% 17.033us 0.32% 17.033us 0.710us 0.000us 0.00% 0.000us 0.000us 24
4052
- aten::empty_like 0.37% 19.303us 1.55% 81.795us 5.453us 0.000us 0.00% 0.000us 0.000us 15
4053
- aten::empty 1.44% 76.031us 1.44% 76.031us 3.168us 0.000us 0.00% 0.000us 0.000us 24
4054
- cudaLaunchKernel 1.98% 104.564us 1.98% 104.564us 6.971us 0.000us 0.00% 0.000us 0.000us 15
4055
- aten::empty_strided 0.28% 14.492us 0.28% 14.492us 4.831us 0.000us 0.00% 0.000us 0.000us 3
4056
- cudaDeviceGetAttribute 0.04% 1.860us 0.04% 1.860us 0.310us 0.000us 0.00% 0.000us 0.000us 6
4057
- cudaFuncSetAttribute 0.10% 5.030us 0.10% 5.030us 1.677us 0.000us 0.00% 0.000us 0.000us 3
4058
- cudaDeviceSynchronize 58.55% 3.085ms 58.55% 3.085ms 3.085ms 0.000us 0.00% 0.000us 0.000us 1
4059
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
- Self CPU time total: 5.269ms
4061
- Self CUDA time total: 3.728ms
4062
 
4063
 
4064
 
@@ -4068,29 +4068,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
4068
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4069
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
- torch_flash_ma 5.01% 280.573us 44.17% 2.475ms 2.475ms 0.000us 0.00% 3.878ms 3.878ms 1
4072
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.831ms 100.27% 3.831ms 3.831ms 1
4073
- aten::scaled_dot_product_attention 0.48% 26.630us 3.39% 189.956us 63.319us 0.000us 0.00% 3.032ms 1.011ms 3
4074
- aten::_scaled_dot_product_flash_attention 0.34% 19.101us 2.91% 163.326us 54.442us 0.000us 0.00% 3.032ms 1.011ms 3
4075
- aten::_flash_attention_forward 0.70% 39.063us 2.15% 120.325us 40.108us 3.032ms 79.37% 3.032ms 1.011ms 3
4076
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.032ms 79.37% 3.032ms 1.011ms 3
4077
- aten::contiguous 0.17% 9.271us 34.98% 1.960ms 163.354us 0.000us 0.00% 845.820us 70.485us 12
4078
- aten::clone 0.52% 28.974us 34.82% 1.951ms 162.581us 0.000us 0.00% 845.820us 70.485us 12
4079
- aten::copy_ 1.48% 83.180us 33.17% 1.859ms 154.908us 788.284us 20.63% 845.820us 70.485us 12
4080
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 788.284us 20.63% 788.284us 65.690us 12
4081
- Activity Buffer Request 26.18% 1.467ms 26.18% 1.467ms 1.467ms 57.536us 1.51% 57.536us 57.536us 1
4082
- aten::transpose 0.89% 50.110us 1.21% 67.952us 2.831us 0.000us 0.00% 0.000us 0.000us 24
4083
- aten::as_strided 0.32% 17.842us 0.32% 17.842us 0.743us 0.000us 0.00% 0.000us 0.000us 24
4084
- aten::empty_like 0.36% 19.969us 1.53% 85.492us 5.699us 0.000us 0.00% 0.000us 0.000us 15
4085
- aten::empty 1.37% 76.982us 1.37% 76.982us 3.208us 0.000us 0.00% 0.000us 0.000us 24
4086
- cudaLaunchKernel 5.95% 333.480us 5.95% 333.480us 22.232us 0.000us 0.00% 0.000us 0.000us 15
4087
- aten::empty_strided 0.30% 17.041us 0.30% 17.041us 5.680us 0.000us 0.00% 0.000us 0.000us 3
4088
- cudaDeviceGetAttribute 0.03% 1.700us 0.03% 1.700us 0.283us 0.000us 0.00% 0.000us 0.000us 6
4089
- cudaFuncSetAttribute 0.07% 4.040us 0.07% 4.040us 1.347us 0.000us 0.00% 0.000us 0.000us 3
4090
- cudaDeviceSynchronize 55.83% 3.129ms 55.83% 3.129ms 3.129ms 0.000us 0.00% 0.000us 0.000us 1
4091
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4092
- Self CPU time total: 5.603ms
4093
- Self CUDA time total: 3.820ms
4094
 
4095
 
4096
 
@@ -4100,29 +4100,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
4100
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4101
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
- torch_flash_ma 5.07% 303.893us 39.93% 2.395ms 2.395ms 0.000us 0.00% 4.370ms 4.370ms 1
4104
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.320ms 100.25% 4.320ms 4.320ms 1
4105
- aten::scaled_dot_product_attention 0.41% 24.650us 3.07% 184.006us 61.335us 0.000us 0.00% 3.503ms 1.168ms 3
4106
- aten::_scaled_dot_product_flash_attention 0.32% 19.311us 2.66% 159.356us 53.119us 0.000us 0.00% 3.503ms 1.168ms 3
4107
- aten::_flash_attention_forward 0.68% 40.911us 1.97% 118.205us 39.402us 3.503ms 81.28% 3.503ms 1.168ms 3
4108
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.503ms 81.28% 3.503ms 1.168ms 3
4109
- aten::contiguous 0.15% 8.977us 31.04% 1.862ms 155.201us 0.000us 0.00% 867.581us 72.298us 12
4110
- aten::clone 0.47% 28.114us 30.89% 1.853ms 154.453us 0.000us 0.00% 867.581us 72.298us 12
4111
- aten::copy_ 1.36% 81.500us 29.40% 1.764ms 146.991us 806.749us 18.72% 867.581us 72.298us 12
4112
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 806.749us 18.72% 806.749us 67.229us 12
4113
- Activity Buffer Request 23.82% 1.429ms 23.82% 1.429ms 1.429ms 60.832us 1.41% 60.832us 60.832us 1
4114
- aten::transpose 0.82% 49.363us 1.11% 66.863us 2.786us 0.000us 0.00% 0.000us 0.000us 24
4115
- aten::as_strided 0.29% 17.500us 0.29% 17.500us 0.729us 0.000us 0.00% 0.000us 0.000us 24
4116
- aten::empty_like 0.33% 20.081us 1.37% 82.424us 5.495us 0.000us 0.00% 0.000us 0.000us 15
4117
- aten::empty 1.26% 75.593us 1.26% 75.593us 3.150us 0.000us 0.00% 0.000us 0.000us 24
4118
- cudaLaunchKernel 4.60% 275.759us 4.60% 275.759us 18.384us 0.000us 0.00% 0.000us 0.000us 15
4119
- aten::empty_strided 0.25% 15.251us 0.25% 15.251us 5.084us 0.000us 0.00% 0.000us 0.000us 3
4120
- cudaDeviceGetAttribute 0.03% 1.740us 0.03% 1.740us 0.290us 0.000us 0.00% 0.000us 0.000us 6
4121
- cudaFuncSetAttribute 0.06% 3.680us 0.06% 3.680us 1.227us 0.000us 0.00% 0.000us 0.000us 3
4122
- cudaDeviceSynchronize 60.07% 3.604ms 60.07% 3.604ms 3.604ms 0.000us 0.00% 0.000us 0.000us 1
4123
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4124
- Self CPU time total: 5.999ms
4125
- Self CUDA time total: 4.309ms
4126
 
4127
 
4128
 
@@ -4132,39 +4132,91 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
4132
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4133
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4134
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4135
- torch_flash_ma 3.83% 232.270us 37.82% 2.296ms 2.296ms 0.000us 0.00% 4.474ms 4.474ms 1
4136
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.423ms 100.25% 4.423ms 4.423ms 1
4137
- aten::scaled_dot_product_attention 0.41% 24.850us 2.85% 172.746us 57.582us 0.000us 0.00% 3.595ms 1.198ms 3
4138
- aten::_scaled_dot_product_flash_attention 0.30% 18.250us 2.44% 147.896us 49.299us 0.000us 0.00% 3.595ms 1.198ms 3
4139
- aten::_flash_attention_forward 0.54% 32.692us 1.77% 107.224us 35.741us 3.595ms 81.48% 3.595ms 1.198ms 3
4140
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.595ms 81.48% 3.595ms 1.198ms 3
4141
- aten::contiguous 0.14% 8.610us 30.41% 1.846ms 153.859us 0.000us 0.00% 878.139us 73.178us 12
4142
- aten::clone 0.45% 27.368us 30.27% 1.838ms 153.142us 0.000us 0.00% 878.139us 73.178us 12
4143
- aten::copy_ 1.35% 81.917us 28.83% 1.750ms 145.831us 817.083us 18.52% 878.139us 73.178us 12
4144
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 817.083us 18.52% 817.083us 68.090us 12
4145
- Activity Buffer Request 23.72% 1.440ms 23.72% 1.440ms 1.440ms 61.056us 1.38% 61.056us 61.056us 1
4146
- aten::transpose 0.82% 50.064us 1.10% 66.792us 2.783us 0.000us 0.00% 0.000us 0.000us 24
4147
- aten::as_strided 0.28% 16.728us 0.28% 16.728us 0.697us 0.000us 0.00% 0.000us 0.000us 24
4148
- aten::empty_like 0.32% 19.431us 1.31% 79.591us 5.306us 0.000us 0.00% 0.000us 0.000us 15
4149
- aten::empty 1.21% 73.220us 1.21% 73.220us 3.051us 0.000us 0.00% 0.000us 0.000us 24
4150
- cudaLaunchKernel 4.12% 249.950us 4.12% 249.950us 16.663us 0.000us 0.00% 0.000us 0.000us 15
4151
- aten::empty_strided 0.24% 14.270us 0.24% 14.270us 4.757us 0.000us 0.00% 0.000us 0.000us 3
4152
- cudaDeviceGetAttribute 0.03% 1.680us 0.03% 1.680us 0.280us 0.000us 0.00% 0.000us 0.000us 6
4153
- cudaFuncSetAttribute 0.07% 4.380us 0.07% 4.380us 1.460us 0.000us 0.00% 0.000us 0.000us 3
4154
- cudaDeviceSynchronize 62.18% 3.775ms 62.18% 3.775ms 3.775ms 0.000us 0.00% 0.000us 0.000us 1
4155
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4156
- Self CPU time total: 6.071ms
4157
- Self CUDA time total: 4.413ms
4158
 
4159
 
4160
  impl wl p50(ms) ok
4161
  torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
4162
- torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
4163
- torch_flash_ma cuda_attn_L320_bfloat16 1.28 True
4164
- torch_flash_ma cuda_attn_L384_bfloat16 1.31 True
4165
  torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
4166
  torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
4167
  </pre></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4168
  <div class="cell-artifacts">
4169
  <h4>Artifacts:</h4>
4170
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: nv | 0.28s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3888
  </div>
3889
  </div>
3890
  <div id="output-nv" class="cell-output">
3891
+ <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:25:53 2025
3892
  +-----------------------------------------------------------------------------------------+
3893
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3894
  |-----------------------------------------+------------------------+----------------------+
 
3897
  | | | MIG M. |
3898
  |=========================================+========================+======================|
3899
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3900
+ | N/A 27C P8 21W / 350W | 0MiB / 46068MiB | 0% Default |
3901
  | | | N/A |
3902
  +-----------------------------------------+------------------------+----------------------+
3903
 
 
3919
  <span class="collapse-indicators">
3920
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3921
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3922
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3923
  </span> |
3924
+ Cell: benchmark | 32.77s
3925
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3926
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3927
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.644ms 102.02% 3.644ms 3.644ms 1
3976
+ torch_flash_ma 6.80% 356.846us 47.04% 2.468ms 2.468ms 0.000us 0.00% 3.612ms 3.612ms 1
3977
+ aten::scaled_dot_product_attention 0.82% 43.042us 4.47% 234.776us 78.259us 0.000us 0.00% 2.857ms 952.201us 3
3978
+ aten::_scaled_dot_product_flash_attention 0.56% 29.330us 3.65% 191.734us 63.911us 0.000us 0.00% 2.857ms 952.201us 3
3979
+ aten::_flash_attention_forward 0.75% 39.581us 2.59% 135.674us 45.225us 2.857ms 79.97% 2.857ms 952.201us 3
3980
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.857ms 79.97% 2.857ms 952.201us 3
3981
+ aten::contiguous 0.27% 14.180us 34.32% 1.801ms 150.051us 0.000us 0.00% 755.680us 62.973us 12
3982
+ aten::clone 0.74% 38.791us 34.04% 1.786ms 148.870us 0.000us 0.00% 755.680us 62.973us 12
3983
+ aten::copy_ 1.85% 97.030us 31.43% 1.649ms 137.429us 715.456us 20.03% 755.680us 62.973us 12
3984
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 715.456us 20.03% 715.456us 59.621us 12
3985
+ Activity Buffer Request 27.38% 1.437ms 27.38% 1.437ms 1.437ms 40.224us 1.13% 40.224us 40.224us 1
3986
+ aten::transpose 1.47% 77.273us 1.96% 102.714us 4.280us 0.000us 0.00% 0.000us 0.000us 24
3987
+ aten::as_strided 0.48% 25.441us 0.48% 25.441us 1.060us 0.000us 0.00% 0.000us 0.000us 24
3988
+ aten::empty_like 0.70% 36.821us 2.35% 123.326us 8.222us 0.000us 0.00% 0.000us 0.000us 15
3989
+ aten::empty 1.93% 101.493us 1.93% 101.493us 4.229us 0.000us 0.00% 0.000us 0.000us 24
3990
+ cudaLaunchKernel 2.70% 141.775us 2.70% 141.775us 9.452us 0.000us 0.00% 0.000us 0.000us 15
3991
+ aten::empty_strided 0.35% 18.402us 0.35% 18.402us 6.134us 0.000us 0.00% 0.000us 0.000us 3
3992
+ cudaDeviceGetAttribute 0.05% 2.540us 0.05% 2.540us 0.423us 0.000us 0.00% 0.000us 0.000us 6
3993
+ cudaFuncSetAttribute 0.17% 8.890us 0.17% 8.890us 2.963us 0.000us 0.00% 0.000us 0.000us 3
3994
+ cudaDeviceSynchronize 52.96% 2.779ms 52.96% 2.779ms 2.779ms 0.000us 0.00% 0.000us 0.000us 1
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
+ Self CPU time total: 5.247ms
3997
+ Self CUDA time total: 3.572ms
3998
 
3999
 
4000
 
 
4004
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
+ torch_flash_ma 4.70% 246.528us 41.73% 2.189ms 2.189ms 0.000us 0.00% 3.817ms 3.817ms 1
4008
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.772ms 100.28% 3.772ms 3.772ms 1
4009
+ aten::scaled_dot_product_attention 0.51% 26.610us 3.43% 180.143us 60.048us 0.000us 0.00% 2.999ms 999.573us 3
4010
+ aten::_scaled_dot_product_flash_attention 0.37% 19.600us 2.93% 153.533us 51.178us 0.000us 0.00% 2.999ms 999.573us 3
4011
+ aten::_flash_attention_forward 0.63% 32.980us 2.12% 111.443us 37.148us 2.999ms 79.71% 2.999ms 999.573us 3
4012
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.999ms 79.71% 2.999ms 999.573us 3
4013
+ aten::contiguous 0.19% 10.030us 32.68% 1.715ms 142.893us 0.000us 0.00% 818.210us 68.184us 12
4014
+ aten::clone 0.55% 29.002us 32.49% 1.705ms 142.057us 0.000us 0.00% 818.210us 68.184us 12
4015
+ aten::copy_ 2.09% 109.441us 30.74% 1.613ms 134.399us 763.297us 20.29% 818.210us 68.184us 12
4016
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 763.297us 20.29% 763.297us 63.608us 12
4017
+ Activity Buffer Request 26.94% 1.413ms 26.94% 1.413ms 1.413ms 54.913us 1.46% 54.913us 54.913us 1
4018
+ aten::transpose 1.00% 52.652us 1.34% 70.433us 2.935us 0.000us 0.00% 0.000us 0.000us 24
4019
+ aten::as_strided 0.34% 17.781us 0.34% 17.781us 0.741us 0.000us 0.00% 0.000us 0.000us 24
4020
+ aten::empty_like 0.38% 19.980us 1.61% 84.581us 5.639us 0.000us 0.00% 0.000us 0.000us 15
4021
+ aten::empty 1.45% 76.201us 1.45% 76.201us 3.175us 0.000us 0.00% 0.000us 0.000us 24
4022
+ cudaLaunchKernel 2.16% 113.102us 2.16% 113.102us 7.540us 0.000us 0.00% 0.000us 0.000us 15
4023
+ aten::empty_strided 0.31% 16.430us 0.31% 16.430us 5.477us 0.000us 0.00% 0.000us 0.000us 3
4024
+ cudaDeviceGetAttribute 0.03% 1.751us 0.03% 1.751us 0.292us 0.000us 0.00% 0.000us 0.000us 6
4025
+ cudaFuncSetAttribute 0.07% 3.771us 0.07% 3.771us 1.257us 0.000us 0.00% 0.000us 0.000us 3
4026
+ cudaDeviceSynchronize 58.27% 3.058ms 58.27% 3.058ms 3.058ms 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
+ Self CPU time total: 5.247ms
4029
+ Self CUDA time total: 3.762ms
4030
 
4031
 
4032
 
 
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
+ torch_flash_ma 4.50% 237.986us 41.18% 2.178ms 2.178ms 0.000us 0.00% 3.833ms 3.833ms 1
4040
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.785ms 100.29% 3.785ms 3.785ms 1
4041
+ aten::scaled_dot_product_attention 0.46% 24.381us 3.40% 179.915us 59.972us 0.000us 0.00% 2.998ms 999.221us 3
4042
+ aten::_scaled_dot_product_flash_attention 0.36% 19.171us 2.94% 155.534us 51.845us 0.000us 0.00% 2.998ms 999.221us 3
4043
+ aten::_flash_attention_forward 0.65% 34.259us 2.15% 113.691us 37.897us 2.998ms 79.44% 2.998ms 999.221us 3
4044
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.998ms 79.44% 2.998ms 999.221us 3
4045
+ aten::contiguous 0.19% 9.800us 32.38% 1.712ms 142.708us 0.000us 0.00% 835.263us 69.605us 12
4046
+ aten::clone 0.53% 28.211us 32.20% 1.703ms 141.891us 0.000us 0.00% 835.263us 69.605us 12
4047
+ aten::copy_ 1.60% 84.650us 30.46% 1.611ms 134.247us 776.063us 20.56% 835.263us 69.605us 12
4048
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 776.063us 20.56% 776.063us 64.672us 12
4049
+ Activity Buffer Request 27.18% 1.437ms 27.18% 1.437ms 1.437ms 59.200us 1.57% 59.200us 59.200us 1
4050
+ aten::transpose 0.99% 52.225us 1.33% 70.125us 2.922us 0.000us 0.00% 0.000us 0.000us 24
4051
+ aten::as_strided 0.34% 17.900us 0.34% 17.900us 0.746us 0.000us 0.00% 0.000us 0.000us 24
4052
+ aten::empty_like 0.37% 19.782us 1.60% 84.803us 5.654us 0.000us 0.00% 0.000us 0.000us 15
4053
+ aten::empty 1.45% 76.431us 1.45% 76.431us 3.185us 0.000us 0.00% 0.000us 0.000us 24
4054
+ cudaLaunchKernel 2.16% 114.204us 2.16% 114.204us 7.614us 0.000us 0.00% 0.000us 0.000us 15
4055
+ aten::empty_strided 0.30% 16.100us 0.30% 16.100us 5.367us 0.000us 0.00% 0.000us 0.000us 3
4056
+ cudaDeviceGetAttribute 0.03% 1.730us 0.03% 1.730us 0.288us 0.000us 0.00% 0.000us 0.000us 6
4057
+ cudaFuncSetAttribute 0.07% 3.730us 0.07% 3.730us 1.243us 0.000us 0.00% 0.000us 0.000us 3
4058
+ cudaDeviceSynchronize 58.82% 3.110ms 58.82% 3.110ms 3.110ms 0.000us 0.00% 0.000us 0.000us 1
4059
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
+ Self CPU time total: 5.288ms
4061
+ Self CUDA time total: 3.774ms
4062
 
4063
 
4064
 
 
4068
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4069
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
+ torch_flash_ma 4.36% 241.837us 43.33% 2.405ms 2.405ms 0.000us 0.00% 3.884ms 3.884ms 1
4072
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.837ms 100.27% 3.837ms 3.837ms 1
4073
+ aten::scaled_dot_product_attention 0.48% 26.802us 3.27% 181.715us 60.572us 0.000us 0.00% 3.042ms 1.014ms 3
4074
+ aten::_scaled_dot_product_flash_attention 0.35% 19.308us 2.79% 154.913us 51.638us 0.000us 0.00% 3.042ms 1.014ms 3
4075
+ aten::_flash_attention_forward 0.60% 33.361us 2.03% 112.712us 37.571us 3.042ms 79.50% 3.042ms 1.014ms 3
4076
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.042ms 79.50% 3.042ms 1.014ms 3
4077
+ aten::contiguous 0.17% 9.659us 34.84% 1.934ms 161.162us 0.000us 0.00% 841.829us 70.152us 12
4078
+ aten::clone 0.50% 27.830us 34.67% 1.924ms 160.357us 0.000us 0.00% 841.829us 70.152us 12
4079
+ aten::copy_ 1.56% 86.702us 32.55% 1.807ms 150.547us 784.548us 20.50% 841.829us 70.152us 12
4080
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 784.548us 20.50% 784.548us 65.379us 12
4081
+ Activity Buffer Request 25.45% 1.413ms 25.45% 1.413ms 1.413ms 57.281us 1.50% 57.281us 57.281us 1
4082
+ aten::transpose 0.95% 52.620us 1.27% 70.404us 2.933us 0.000us 0.00% 0.000us 0.000us 24
4083
+ aten::as_strided 0.32% 17.784us 0.32% 17.784us 0.741us 0.000us 0.00% 0.000us 0.000us 24
4084
+ aten::empty_like 0.78% 43.221us 2.00% 111.194us 7.413us 0.000us 0.00% 0.000us 0.000us 15
4085
+ aten::empty 1.45% 80.673us 1.45% 80.673us 3.361us 0.000us 0.00% 0.000us 0.000us 24
4086
+ cudaLaunchKernel 5.96% 331.078us 5.96% 331.078us 22.072us 0.000us 0.00% 0.000us 0.000us 15
4087
+ aten::empty_strided 0.28% 15.800us 0.28% 15.800us 5.267us 0.000us 0.00% 0.000us 0.000us 3
4088
+ cudaDeviceGetAttribute 0.03% 1.730us 0.03% 1.730us 0.288us 0.000us 0.00% 0.000us 0.000us 6
4089
+ cudaFuncSetAttribute 0.07% 3.850us 0.07% 3.850us 1.283us 0.000us 0.00% 0.000us 0.000us 3
4090
+ cudaDeviceSynchronize 56.67% 3.146ms 56.67% 3.146ms 3.146ms 0.000us 0.00% 0.000us 0.000us 1
4091
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4092
+ Self CPU time total: 5.551ms
4093
+ Self CUDA time total: 3.827ms
4094
 
4095
 
4096
 
 
4100
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4101
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
+ torch_flash_ma 4.46% 268.165us 40.09% 2.413ms 2.413ms 0.000us 0.00% 4.405ms 4.405ms 1
4104
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.355ms 100.25% 4.355ms 4.355ms 1
4105
+ aten::scaled_dot_product_attention 0.46% 27.642us 3.64% 218.806us 72.935us 0.000us 0.00% 3.540ms 1.180ms 3
4106
+ aten::_scaled_dot_product_flash_attention 0.75% 45.250us 3.18% 191.164us 63.721us 0.000us 0.00% 3.540ms 1.180ms 3
4107
+ aten::_flash_attention_forward 0.61% 36.651us 2.01% 120.923us 40.308us 3.540ms 81.48% 3.540ms 1.180ms 3
4108
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.540ms 81.48% 3.540ms 1.180ms 3
4109
+ aten::contiguous 0.18% 10.862us 31.11% 1.873ms 156.050us 0.000us 0.00% 865.606us 72.134us 12
4110
+ aten::clone 0.51% 30.490us 30.93% 1.862ms 155.145us 0.000us 0.00% 865.606us 72.134us 12
4111
+ aten::copy_ 1.51% 90.931us 29.34% 1.766ms 147.155us 804.645us 18.52% 865.606us 72.134us 12
4112
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 804.645us 18.52% 804.645us 67.054us 12
4113
+ Activity Buffer Request 21.61% 1.300ms 21.61% 1.300ms 1.300ms 60.961us 1.40% 60.961us 60.961us 1
4114
+ aten::transpose 0.99% 59.753us 1.30% 78.501us 3.271us 0.000us 0.00% 0.000us 0.000us 24
4115
+ aten::as_strided 0.31% 18.748us 0.31% 18.748us 0.781us 0.000us 0.00% 0.000us 0.000us 24
4116
+ aten::empty_like 0.35% 20.935us 1.45% 87.165us 5.811us 0.000us 0.00% 0.000us 0.000us 15
4117
+ aten::empty 1.32% 79.690us 1.32% 79.690us 3.320us 0.000us 0.00% 0.000us 0.000us 24
4118
+ cudaLaunchKernel 6.67% 401.680us 6.67% 401.680us 26.779us 0.000us 0.00% 0.000us 0.000us 15
4119
+ aten::empty_strided 0.27% 16.081us 0.27% 16.081us 5.360us 0.000us 0.00% 0.000us 0.000us 3
4120
+ cudaDeviceGetAttribute 0.03% 2.030us 0.03% 2.030us 0.338us 0.000us 0.00% 0.000us 0.000us 6
4121
+ cudaFuncSetAttribute 0.06% 3.810us 0.06% 3.810us 1.270us 0.000us 0.00% 0.000us 0.000us 3
4122
+ cudaDeviceSynchronize 59.91% 3.605ms 59.91% 3.605ms 3.605ms 0.000us 0.00% 0.000us 0.000us 1
4123
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4124
+ Self CPU time total: 6.018ms
4125
+ Self CUDA time total: 4.344ms
4126
 
4127
 
4128
 
 
4132
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4133
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4134
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4135
+ torch_flash_ma 4.01% 246.839us 39.75% 2.447ms 2.447ms 0.000us 0.00% 4.458ms 4.458ms 1
4136
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.407ms 100.23% 4.407ms 4.407ms 1
4137
+ aten::scaled_dot_product_attention 0.40% 24.621us 2.95% 181.474us 60.491us 0.000us 0.00% 3.579ms 1.193ms 3
4138
+ aten::_scaled_dot_product_flash_attention 0.34% 20.980us 2.55% 156.853us 52.284us 0.000us 0.00% 3.579ms 1.193ms 3
4139
+ aten::_flash_attention_forward 0.58% 35.588us 1.84% 113.003us 37.668us 3.579ms 81.40% 3.579ms 1.193ms 3
4140
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.579ms 81.40% 3.579ms 1.193ms 3
4141
+ aten::contiguous 0.16% 10.061us 32.01% 1.971ms 164.244us 0.000us 0.00% 878.818us 73.235us 12
4142
+ aten::clone 0.50% 30.903us 31.85% 1.961ms 163.406us 0.000us 0.00% 878.818us 73.235us 12
4143
+ aten::copy_ 1.35% 82.841us 30.27% 1.864ms 155.305us 817.634us 18.60% 878.818us 73.235us 12
4144
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 817.634us 18.60% 817.634us 68.136us 12
4145
+ Activity Buffer Request 23.50% 1.447ms 23.50% 1.447ms 1.447ms 61.184us 1.39% 61.184us 61.184us 1
4146
+ aten::transpose 0.85% 52.630us 1.15% 70.790us 2.950us 0.000us 0.00% 0.000us 0.000us 24
4147
+ aten::as_strided 0.29% 18.160us 0.29% 18.160us 0.757us 0.000us 0.00% 0.000us 0.000us 24
4148
+ aten::empty_like 0.33% 20.456us 1.41% 86.700us 5.780us 0.000us 0.00% 0.000us 0.000us 15
4149
+ aten::empty 1.28% 78.794us 1.28% 78.794us 3.283us 0.000us 0.00% 0.000us 0.000us 24
4150
+ cudaLaunchKernel 5.81% 357.919us 5.81% 357.919us 23.861us 0.000us 0.00% 0.000us 0.000us 15
4151
+ aten::empty_strided 0.25% 15.401us 0.25% 15.401us 5.134us 0.000us 0.00% 0.000us 0.000us 3
4152
+ cudaDeviceGetAttribute 0.03% 1.632us 0.03% 1.632us 0.272us 0.000us 0.00% 0.000us 0.000us 6
4153
+ cudaFuncSetAttribute 0.06% 3.720us 0.06% 3.720us 1.240us 0.000us 0.00% 0.000us 0.000us 3
4154
+ cudaDeviceSynchronize 60.25% 3.709ms 60.25% 3.709ms 3.709ms 0.000us 0.00% 0.000us 0.000us 1
4155
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4156
+ Self CPU time total: 6.156ms
4157
+ Self CUDA time total: 4.397ms
4158
 
4159
 
4160
  impl wl p50(ms) ok
4161
  torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
4162
+ torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
4163
+ torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
4164
+ torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
4165
  torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
4166
  torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
4167
  </pre></div>
4168
+ <div class="uv-install-logs" id="uv-logs-benchmark">
4169
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4170
+ <div class="uv-logs-content" style="display: none;">
4171
+ Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4172
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4173
+ Downloading matplotlib (8.3MiB)
4174
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4175
+ Downloading numpy (16.2MiB)
4176
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4177
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4178
+ Downloading nvidia-cublas-cu12 (566.8MiB)
4179
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4180
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4181
+ Downloading kiwisolver (1.4MiB)
4182
+ Downloading networkx (1.9MiB)
4183
+ Downloading nvidia-curand-cu12 (60.7MiB)
4184
+ Downloading sympy (6.0MiB)
4185
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4186
+ Downloading setuptools (1.1MiB)
4187
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4188
+ Downloading triton (148.3MiB)
4189
+ Downloading pillow (6.7MiB)
4190
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4191
+ Downloading fonttools (4.7MiB)
4192
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4193
+ Downloading torch (846.9MiB)
4194
+ Downloading nvidia-cufile-cu12
4195
+ Downloading kiwisolver
4196
+ Downloading setuptools
4197
+ Downloading networkx
4198
+ Downloading fonttools
4199
+ Downloading pillow
4200
+ Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4201
+ Downloading nvidia-cuda-cupti-cu12
4202
+ Downloading matplotlib
4203
+ Downloading numpy
4204
+ Downloading sympy
4205
+ Downloading nvidia-nvjitlink-cu12
4206
+ Downloading nvidia-curand-cu12
4207
+ Downloading nvidia-cuda-nvrtc-cu12
4208
+ Downloading triton
4209
+ Downloading nvidia-cufft-cu12
4210
+ Downloading nvidia-cusolver-cu12
4211
+ Downloading nvidia-cusparse-cu12
4212
+ Downloading nvidia-cusparselt-cu12
4213
+ Downloading nvidia-nccl-cu12
4214
+ Downloading nvidia-cublas-cu12
4215
+ Downloading nvidia-cudnn-cu12
4216
+ Downloading torch
4217
+ Installed 37 packages in 212ms
4218
+ </div>
4219
+ </div>
4220
  <div class="cell-artifacts">
4221
  <h4>Artifacts:</h4>
4222
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/hf_kernels_flash_attn.html CHANGED
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 6.08s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3926,21 +3926,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
3926
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3927
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3928
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3929
- hf_kernels_flash_attn 3.64% 160.058us 41.50% 1.823ms 1.823ms 0.000us 0.00% 3.744ms 3.744ms 1
3930
- _flash_attn_9e27194::fwd 1.78% 78.347us 37.86% 1.663ms 554.208us 2.792ms 100.00% 3.744ms 1.248ms 3
3931
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.794ms 100.05% 2.794ms 2.794ms 1
3932
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.792ms 100.00% 2.792ms 930.800us 3
3933
- Activity Buffer Request 33.00% 1.449ms 33.00% 1.449ms 1.449ms 951.685us 34.08% 951.685us 951.685us 1
3934
- cudaDeviceGetAttribute 0.13% 5.638us 0.13% 5.638us 0.376us 0.000us 0.00% 0.000us 0.000us 15
3935
- aten::empty_like 0.40% 17.551us 1.19% 52.122us 17.374us 0.000us 0.00% 0.000us 0.000us 3
3936
- aten::empty_strided 0.79% 34.571us 0.79% 34.571us 11.524us 0.000us 0.00% 0.000us 0.000us 3
3937
- aten::empty 0.57% 24.890us 0.57% 24.890us 2.766us 0.000us 0.00% 0.000us 0.000us 9
3938
- cudaFuncSetAttribute 0.28% 12.210us 0.28% 12.210us 4.070us 0.000us 0.00% 0.000us 0.000us 3
3939
- cudaLaunchKernel 0.92% 40.292us 0.92% 40.292us 13.431us 0.000us 0.00% 0.000us 0.000us 3
3940
- cudaDeviceSynchronize 58.50% 2.569ms 58.50% 2.569ms 2.569ms 0.000us 0.00% 0.000us 0.000us 1
3941
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3942
- Self CPU time total: 4.392ms
3943
- Self CUDA time total: 2.792ms
3944
 
3945
 
3946
 
@@ -3950,21 +3950,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
3950
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3951
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3952
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3953
- hf_kernels_flash_attn 2.22% 99.144us 37.48% 1.673ms 1.673ms 0.000us 0.00% 3.949ms 3.949ms 1
3954
- _flash_attn_9e27194::fwd 1.20% 53.462us 35.26% 1.574ms 524.654us 2.953ms 100.00% 3.949ms 1.316ms 3
3955
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.955ms 100.05% 2.955ms 2.955ms 1
3956
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.953ms 100.00% 2.953ms 984.436us 3
3957
- Activity Buffer Request 32.23% 1.439ms 32.23% 1.439ms 1.439ms 995.807us 33.72% 995.807us 995.807us 1
3958
- cudaDeviceGetAttribute 0.10% 4.621us 0.10% 4.621us 0.308us 0.000us 0.00% 0.000us 0.000us 15
3959
- aten::empty_like 0.17% 7.710us 0.56% 24.861us 8.287us 0.000us 0.00% 0.000us 0.000us 3
3960
- aten::empty_strided 0.38% 17.151us 0.38% 17.151us 5.717us 0.000us 0.00% 0.000us 0.000us 3
3961
- aten::empty 0.47% 21.122us 0.47% 21.122us 2.347us 0.000us 0.00% 0.000us 0.000us 9
3962
- cudaFuncSetAttribute 0.08% 3.791us 0.08% 3.791us 1.264us 0.000us 0.00% 0.000us 0.000us 3
3963
- cudaLaunchKernel 0.61% 27.380us 0.61% 27.380us 9.127us 0.000us 0.00% 0.000us 0.000us 3
3964
- cudaDeviceSynchronize 62.52% 2.791ms 62.52% 2.791ms 2.791ms 0.000us 0.00% 0.000us 0.000us 1
3965
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3966
- Self CPU time total: 4.464ms
3967
- Self CUDA time total: 2.953ms
3968
 
3969
 
3970
 
@@ -3974,21 +3974,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3976
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
- hf_kernels_flash_attn 2.58% 116.955us 37.54% 1.702ms 1.702ms 0.000us 0.00% 4.041ms 4.041ms 1
3978
- _flash_attn_9e27194::fwd 1.53% 69.255us 34.96% 1.585ms 528.314us 3.010ms 100.00% 4.041ms 1.347ms 3
3979
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.012ms 100.05% 3.012ms 3.012ms 1
3980
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.010ms 100.00% 3.010ms 1.003ms 3
3981
- Activity Buffer Request 31.53% 1.430ms 31.53% 1.430ms 1.430ms 1.031ms 34.26% 1.031ms 1.031ms 1
3982
- cudaDeviceGetAttribute 0.10% 4.450us 0.10% 4.450us 0.297us 0.000us 0.00% 0.000us 0.000us 15
3983
- aten::empty_like 0.18% 8.151us 0.57% 25.801us 8.600us 0.000us 0.00% 0.000us 0.000us 3
3984
- aten::empty_strided 0.39% 17.650us 0.39% 17.650us 5.883us 0.000us 0.00% 0.000us 0.000us 3
3985
- aten::empty 0.48% 21.771us 0.48% 21.771us 2.419us 0.000us 0.00% 0.000us 0.000us 9
3986
- cudaFuncSetAttribute 0.10% 4.360us 0.10% 4.360us 1.453us 0.000us 0.00% 0.000us 0.000us 3
3987
- cudaLaunchKernel 0.66% 29.790us 0.66% 29.790us 9.930us 0.000us 0.00% 0.000us 0.000us 3
3988
- cudaDeviceSynchronize 62.46% 2.832ms 62.46% 2.832ms 2.832ms 0.000us 0.00% 0.000us 0.000us 1
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
- Self CPU time total: 4.534ms
3991
- Self CUDA time total: 3.010ms
3992
 
3993
 
3994
 
@@ -3998,21 +3998,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4000
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4001
- hf_kernels_flash_attn 2.39% 114.805us 40.03% 1.925ms 1.925ms 0.000us 0.00% 4.094ms 4.094ms 1
4002
- _flash_attn_9e27194::fwd 1.09% 52.653us 37.65% 1.810ms 603.407us 3.063ms 100.00% 4.094ms 1.365ms 3
4003
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.065ms 100.05% 3.065ms 3.065ms 1
4004
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.063ms 100.00% 3.063ms 1.021ms 3
4005
- Activity Buffer Request 29.78% 1.432ms 29.78% 1.432ms 1.432ms 1.031ms 33.65% 1.031ms 1.031ms 1
4006
- cudaDeviceGetAttribute 0.10% 4.861us 0.10% 4.861us 0.324us 0.000us 0.00% 0.000us 0.000us 15
4007
- aten::empty_like 0.16% 7.720us 0.55% 26.331us 8.777us 0.000us 0.00% 0.000us 0.000us 3
4008
- aten::empty_strided 0.39% 18.611us 0.39% 18.611us 6.204us 0.000us 0.00% 0.000us 0.000us 3
4009
- aten::empty 0.45% 21.731us 0.45% 21.731us 2.415us 0.000us 0.00% 0.000us 0.000us 9
4010
- cudaFuncSetAttribute 0.08% 3.728us 0.08% 3.728us 1.243us 0.000us 0.00% 0.000us 0.000us 3
4011
- cudaLaunchKernel 5.59% 268.862us 5.59% 268.862us 89.621us 0.000us 0.00% 0.000us 0.000us 3
4012
- cudaDeviceSynchronize 59.97% 2.884ms 59.97% 2.884ms 2.884ms 0.000us 0.00% 0.000us 0.000us 1
4013
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4014
- Self CPU time total: 4.809ms
4015
- Self CUDA time total: 3.063ms
4016
 
4017
 
4018
 
@@ -4022,21 +4022,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
4022
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4023
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4024
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
- hf_kernels_flash_attn 2.13% 113.755us 35.84% 1.918ms 1.918ms 0.000us 0.00% 4.786ms 4.786ms 1
4026
- _flash_attn_9e27194::fwd 1.02% 54.483us 33.71% 1.804ms 601.364us 3.588ms 100.00% 4.786ms 1.595ms 3
4027
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.590ms 100.04% 3.590ms 3.590ms 1
4028
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.588ms 100.00% 3.588ms 1.196ms 3
4029
- Activity Buffer Request 26.99% 1.445ms 26.99% 1.445ms 1.445ms 1.198ms 33.38% 1.198ms 1.198ms 1
4030
- cudaDeviceGetAttribute 0.08% 4.270us 0.08% 4.270us 0.285us 0.000us 0.00% 0.000us 0.000us 15
4031
- aten::empty_like 0.15% 8.039us 0.48% 25.640us 8.547us 0.000us 0.00% 0.000us 0.000us 3
4032
- aten::empty_strided 0.33% 17.601us 0.33% 17.601us 5.867us 0.000us 0.00% 0.000us 0.000us 3
4033
- aten::empty 0.40% 21.582us 0.40% 21.582us 2.398us 0.000us 0.00% 0.000us 0.000us 9
4034
- cudaFuncSetAttribute 0.07% 3.700us 0.07% 3.700us 1.233us 0.000us 0.00% 0.000us 0.000us 3
4035
- cudaLaunchKernel 4.67% 249.891us 4.67% 249.891us 83.297us 0.000us 0.00% 0.000us 0.000us 3
4036
- cudaDeviceSynchronize 64.16% 3.434ms 64.16% 3.434ms 3.434ms 0.000us 0.00% 0.000us 0.000us 1
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
- Self CPU time total: 5.351ms
4039
- Self CUDA time total: 3.588ms
4040
 
4041
 
4042
 
@@ -4046,41 +4046,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
4046
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4047
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
- hf_kernels_flash_attn 2.08% 111.044us 35.25% 1.879ms 1.879ms 0.000us 0.00% 4.816ms 4.816ms 1
4050
- _flash_attn_9e27194::fwd 0.99% 52.834us 33.17% 1.768ms 589.427us 3.606ms 100.00% 4.816ms 1.605ms 3
4051
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.607ms 100.05% 3.607ms 3.607ms 1
4052
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.606ms 100.00% 3.606ms 1.202ms 3
4053
- Activity Buffer Request 26.56% 1.416ms 26.56% 1.416ms 1.416ms 1.210ms 33.55% 1.210ms 1.210ms 1
4054
- cudaDeviceGetAttribute 0.08% 4.460us 0.08% 4.460us 0.297us 0.000us 0.00% 0.000us 0.000us 15
4055
- aten::empty_like 0.14% 7.500us 0.49% 26.051us 8.684us 0.000us 0.00% 0.000us 0.000us 3
4056
- aten::empty_strided 0.35% 18.551us 0.35% 18.551us 6.184us 0.000us 0.00% 0.000us 0.000us 3
4057
- aten::empty 0.41% 21.960us 0.41% 21.960us 2.440us 0.000us 0.00% 0.000us 0.000us 9
4058
- cudaFuncSetAttribute 0.08% 4.009us 0.08% 4.009us 1.336us 0.000us 0.00% 0.000us 0.000us 3
4059
- cudaLaunchKernel 4.55% 242.792us 4.55% 242.792us 80.931us 0.000us 0.00% 0.000us 0.000us 3
4060
- cudaDeviceSynchronize 64.75% 3.452ms 64.75% 3.452ms 3.452ms 0.000us 0.00% 0.000us 0.000us 1
4061
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4062
- Self CPU time total: 5.332ms
4063
- Self CUDA time total: 3.606ms
4064
 
4065
 
4066
  impl wl p50(ms) ok
4067
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
4068
  hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
4069
  hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
4070
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.05 True
4071
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
4072
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.21 True
4073
  </pre></div>
4074
- <div class="uv-install-logs" id="uv-logs-benchmark">
4075
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4076
- <div class="uv-logs-content" style="display: none;">
4077
- Installed 15 packages in 13ms
4078
  </div>
4079
- </div>
4080
- <div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
4081
- Fetching 20 files: 5%|▌ | 1/20 [00:00&lt;00:04, 4.26it/s]
4082
- Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:17, 1.03it/s]
4083
- Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 11.64it/s]</div>
4084
  <div class="cell-artifacts">
4085
  <h4>Artifacts:</h4>
4086
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 5.58s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3926
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3927
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3928
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3929
+ hf_kernels_flash_attn 3.55% 156.153us 41.08% 1.807ms 1.807ms 0.000us 0.00% 3.775ms 3.775ms 1
3930
+ _flash_attn_9e27194::fwd 1.65% 72.542us 37.53% 1.651ms 550.240us 2.812ms 100.00% 3.775ms 1.258ms 3
3931
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.814ms 100.05% 2.814ms 2.814ms 1
3932
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.812ms 100.00% 2.812ms 937.398us 3
3933
+ Activity Buffer Request 32.22% 1.417ms 32.22% 1.417ms 1.417ms 962.880us 34.24% 962.880us 962.880us 1
3934
+ cudaDeviceGetAttribute 0.13% 5.500us 0.13% 5.500us 0.367us 0.000us 0.00% 0.000us 0.000us 15
3935
+ aten::empty_like 0.43% 19.110us 1.25% 54.882us 18.294us 0.000us 0.00% 0.000us 0.000us 3
3936
+ aten::empty_strided 0.81% 35.772us 0.81% 35.772us 11.924us 0.000us 0.00% 0.000us 0.000us 3
3937
+ aten::empty 0.57% 25.101us 0.57% 25.101us 2.789us 0.000us 0.00% 0.000us 0.000us 9
3938
+ cudaFuncSetAttribute 0.30% 13.270us 0.30% 13.270us 4.423us 0.000us 0.00% 0.000us 0.000us 3
3939
+ cudaLaunchKernel 1.42% 62.402us 1.42% 62.402us 20.801us 0.000us 0.00% 0.000us 0.000us 3
3940
+ cudaDeviceSynchronize 58.92% 2.591ms 58.92% 2.591ms 2.591ms 0.000us 0.00% 0.000us 0.000us 1
3941
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3942
+ Self CPU time total: 4.398ms
3943
+ Self CUDA time total: 2.812ms
3944
 
3945
 
3946
 
 
3950
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3951
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3952
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3953
+ hf_kernels_flash_attn 2.04% 91.192us 36.62% 1.634ms 1.634ms 0.000us 0.00% 3.983ms 3.983ms 1
3954
+ _flash_attn_9e27194::fwd 1.11% 49.718us 34.57% 1.543ms 514.203us 2.978ms 100.00% 3.983ms 1.328ms 3
3955
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.980ms 100.05% 2.980ms 2.980ms 1
3956
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.978ms 100.00% 2.978ms 992.707us 3
3957
+ Activity Buffer Request 31.74% 1.416ms 31.74% 1.416ms 1.416ms 1.004ms 33.73% 1.004ms 1.004ms 1
3958
+ cudaDeviceGetAttribute 0.08% 3.711us 0.08% 3.711us 0.247us 0.000us 0.00% 0.000us 0.000us 15
3959
+ aten::empty_like 0.17% 7.481us 0.51% 22.841us 7.614us 0.000us 0.00% 0.000us 0.000us 3
3960
+ aten::empty_strided 0.34% 15.360us 0.34% 15.360us 5.120us 0.000us 0.00% 0.000us 0.000us 3
3961
+ aten::empty 0.46% 20.620us 0.46% 20.620us 2.291us 0.000us 0.00% 0.000us 0.000us 9
3962
+ cudaFuncSetAttribute 0.08% 3.741us 0.08% 3.741us 1.247us 0.000us 0.00% 0.000us 0.000us 3
3963
+ cudaLaunchKernel 0.58% 25.842us 0.58% 25.842us 8.614us 0.000us 0.00% 0.000us 0.000us 3
3964
+ cudaDeviceSynchronize 63.38% 2.828ms 63.38% 2.828ms 2.828ms 0.000us 0.00% 0.000us 0.000us 1
3965
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3966
+ Self CPU time total: 4.462ms
3967
+ Self CUDA time total: 2.978ms
3968
 
3969
 
3970
 
 
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3976
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
+ hf_kernels_flash_attn 2.28% 105.284us 36.17% 1.673ms 1.673ms 0.000us 0.00% 4.145ms 4.145ms 1
3978
+ _flash_attn_9e27194::fwd 1.09% 50.271us 33.89% 1.567ms 522.459us 3.096ms 100.00% 4.145ms 1.382ms 3
3979
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.098ms 100.05% 3.098ms 3.098ms 1
3980
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.096ms 100.00% 3.096ms 1.032ms 3
3981
+ Activity Buffer Request 31.08% 1.437ms 31.08% 1.437ms 1.437ms 1.049ms 33.87% 1.049ms 1.049ms 1
3982
+ cudaDeviceGetAttribute 0.08% 3.850us 0.08% 3.850us 0.257us 0.000us 0.00% 0.000us 0.000us 15
3983
+ aten::empty_like 0.15% 7.061us 0.49% 22.631us 7.544us 0.000us 0.00% 0.000us 0.000us 3
3984
+ aten::empty_strided 0.34% 15.570us 0.34% 15.570us 5.190us 0.000us 0.00% 0.000us 0.000us 3
3985
+ aten::empty 0.47% 21.760us 0.47% 21.760us 2.418us 0.000us 0.00% 0.000us 0.000us 9
3986
+ cudaFuncSetAttribute 0.08% 3.689us 0.08% 3.689us 1.230us 0.000us 0.00% 0.000us 0.000us 3
3987
+ cudaLaunchKernel 0.61% 27.992us 0.61% 27.992us 9.331us 0.000us 0.00% 0.000us 0.000us 3
3988
+ cudaDeviceSynchronize 63.83% 2.952ms 63.83% 2.952ms 2.952ms 0.000us 0.00% 0.000us 0.000us 1
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
+ Self CPU time total: 4.625ms
3991
+ Self CUDA time total: 3.096ms
3992
 
3993
 
3994
 
 
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4000
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4001
+ hf_kernels_flash_attn 2.30% 110.882us 38.29% 1.842ms 1.842ms 0.000us 0.00% 4.161ms 4.161ms 1
4002
+ _flash_attn_9e27194::fwd 1.05% 50.321us 35.98% 1.731ms 577.014us 3.117ms 100.00% 4.161ms 1.387ms 3
4003
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.118ms 100.05% 3.118ms 3.118ms 1
4004
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.117ms 100.00% 3.117ms 1.039ms 3
4005
+ Activity Buffer Request 29.64% 1.426ms 29.64% 1.426ms 1.426ms 1.044ms 33.50% 1.044ms 1.044ms 1
4006
+ cudaDeviceGetAttribute 0.08% 3.780us 0.08% 3.780us 0.252us 0.000us 0.00% 0.000us 0.000us 15
4007
+ aten::empty_like 0.15% 7.259us 0.50% 24.240us 8.080us 0.000us 0.00% 0.000us 0.000us 3
4008
+ aten::empty_strided 0.35% 16.981us 0.35% 16.981us 5.660us 0.000us 0.00% 0.000us 0.000us 3
4009
+ aten::empty 0.45% 21.602us 0.45% 21.602us 2.400us 0.000us 0.00% 0.000us 0.000us 9
4010
+ cudaFuncSetAttribute 0.08% 3.770us 0.08% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3
4011
+ cudaLaunchKernel 4.18% 201.205us 4.18% 201.205us 67.068us 0.000us 0.00% 0.000us 0.000us 3
4012
+ cudaDeviceSynchronize 61.71% 2.969ms 61.71% 2.969ms 2.969ms 0.000us 0.00% 0.000us 0.000us 1
4013
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4014
+ Self CPU time total: 4.811ms
4015
+ Self CUDA time total: 3.117ms
4016
 
4017
 
4018
 
 
4022
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4023
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4024
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
+ hf_kernels_flash_attn 2.05% 108.443us 34.64% 1.832ms 1.832ms 0.000us 0.00% 4.810ms 4.810ms 1
4026
+ _flash_attn_9e27194::fwd 0.96% 50.812us 32.59% 1.723ms 574.364us 3.602ms 100.00% 4.810ms 1.603ms 3
4027
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.603ms 100.04% 3.603ms 3.603ms 1
4028
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.602ms 100.00% 3.602ms 1.201ms 3
4029
+ Activity Buffer Request 27.53% 1.455ms 27.53% 1.455ms 1.455ms 1.209ms 33.55% 1.209ms 1.209ms 1
4030
+ cudaDeviceGetAttribute 0.08% 4.070us 0.08% 4.070us 0.271us 0.000us 0.00% 0.000us 0.000us 15
4031
+ aten::empty_like 0.14% 7.390us 0.45% 23.900us 7.967us 0.000us 0.00% 0.000us 0.000us 3
4032
+ aten::empty_strided 0.31% 16.510us 0.31% 16.510us 5.503us 0.000us 0.00% 0.000us 0.000us 3
4033
+ aten::empty 0.40% 21.151us 0.40% 21.151us 2.350us 0.000us 0.00% 0.000us 0.000us 9
4034
+ cudaFuncSetAttribute 0.07% 3.770us 0.07% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3
4035
+ cudaLaunchKernel 3.10% 164.023us 3.10% 164.023us 54.674us 0.000us 0.00% 0.000us 0.000us 3
4036
+ cudaDeviceSynchronize 65.36% 3.455ms 65.36% 3.455ms 3.455ms 0.000us 0.00% 0.000us 0.000us 1
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
+ Self CPU time total: 5.287ms
4039
+ Self CUDA time total: 3.602ms
4040
 
4041
 
4042
 
 
4046
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4047
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
+ hf_kernels_flash_attn 1.95% 105.103us 34.11% 1.836ms 1.836ms 0.000us 0.00% 4.931ms 4.931ms 1
4050
+ _flash_attn_9e27194::fwd 1.08% 58.141us 32.16% 1.731ms 577.087us 3.693ms 100.00% 4.931ms 1.644ms 3
4051
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.695ms 100.04% 3.695ms 3.695ms 1
4052
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.693ms 100.00% 3.693ms 1.231ms 3
4053
+ Activity Buffer Request 26.71% 1.438ms 26.71% 1.438ms 1.438ms 1.238ms 33.53% 1.238ms 1.238ms 1
4054
+ cudaDeviceGetAttribute 0.08% 4.380us 0.08% 4.380us 0.292us 0.000us 0.00% 0.000us 0.000us 15
4055
+ aten::empty_like 0.15% 8.230us 0.50% 26.750us 8.917us 0.000us 0.00% 0.000us 0.000us 3
4056
+ aten::empty_strided 0.34% 18.520us 0.34% 18.520us 6.173us 0.000us 0.00% 0.000us 0.000us 3
4057
+ aten::empty 0.48% 25.961us 0.48% 25.961us 2.885us 0.000us 0.00% 0.000us 0.000us 9
4058
+ cudaFuncSetAttribute 0.08% 4.220us 0.08% 4.220us 1.407us 0.000us 0.00% 0.000us 0.000us 3
4059
+ cudaLaunchKernel 3.23% 173.714us 3.23% 173.714us 57.905us 0.000us 0.00% 0.000us 0.000us 3
4060
+ cudaDeviceSynchronize 65.89% 3.548ms 65.89% 3.548ms 3.548ms 0.000us 0.00% 0.000us 0.000us 1
4061
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4062
+ Self CPU time total: 5.384ms
4063
+ Self CUDA time total: 3.693ms
4064
 
4065
 
4066
  impl wl p50(ms) ok
4067
+ hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.97 True
4068
  hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
4069
  hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
4070
+ hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.09 True
4071
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True
4072
+ hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
4073
  </pre></div>
4074
+ <div class="cell-stderr">
4075
+ Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
4076
+ Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:13, 1.34it/s]
4077
+ Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 13.40it/s]
4078
  </div>
 
 
 
 
 
4079
  <div class="cell-artifacts">
4080
  <h4>Artifacts:</h4>
4081
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 5.68s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3925,19 +3925,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
3925
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3926
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3927
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3928
- hf_kernels_flash_attn3 3.89% 167.076us 44.49% 1.911ms 1.911ms 0.000us 0.00% 3.576ms 3.576ms 1
3929
- FlashAttnFunc 3.00% 128.934us 40.60% 1.744ms 581.290us 0.000us 0.00% 3.576ms 1.192ms 3
3930
- _flash_attn3_48fe103_dirty::fwd 1.82% 78.184us 37.60% 1.615ms 538.312us 2.688ms 100.00% 3.576ms 1.192ms 3
3931
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.690ms 100.05% 2.690ms 2.690ms 1
3932
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.688ms 100.00% 2.688ms 896.117us 3
3933
- Activity Buffer Request 33.29% 1.430ms 33.29% 1.430ms 1.430ms 887.327us 33.01% 887.327us 887.327us 1
3934
- aten::empty 1.08% 46.281us 1.08% 46.281us 7.714us 0.000us 0.00% 0.000us 0.000us 6
3935
- cudaFuncSetAttribute 0.37% 15.900us 0.37% 15.900us 5.300us 0.000us 0.00% 0.000us 0.000us 3
3936
- cudaLaunchKernel 1.04% 44.671us 1.04% 44.671us 14.890us 0.000us 0.00% 0.000us 0.000us 3
3937
- cudaDeviceSynchronize 55.51% 2.384ms 55.51% 2.384ms 2.384ms 0.000us 0.00% 0.000us 0.000us 1
3938
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3939
- Self CPU time total: 4.295ms
3940
- Self CUDA time total: 2.688ms
3941
 
3942
 
3943
 
@@ -3947,19 +3947,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
3947
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3948
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3949
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3950
- hf_kernels_flash_attn3 3.06% 130.754us 41.10% 1.758ms 1.758ms 0.000us 0.00% 3.668ms 3.668ms 1
3951
- FlashAttnFunc 2.23% 95.572us 38.05% 1.627ms 542.455us 0.000us 0.00% 3.668ms 1.223ms 3
3952
- _flash_attn3_48fe103_dirty::fwd 1.23% 52.754us 35.81% 1.532ms 510.598us 2.747ms 100.00% 3.668ms 1.223ms 3
3953
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.748ms 100.05% 2.748ms 2.748ms 1
3954
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.747ms 100.00% 2.747ms 915.501us 3
3955
- Activity Buffer Request 33.10% 1.416ms 33.10% 1.416ms 1.416ms 921.272us 33.54% 921.272us 921.272us 1
3956
- aten::empty 0.63% 26.890us 0.63% 26.890us 4.482us 0.000us 0.00% 0.000us 0.000us 6
3957
- cudaFuncSetAttribute 0.12% 4.970us 0.12% 4.970us 1.657us 0.000us 0.00% 0.000us 0.000us 3
3958
- cudaLaunchKernel 0.73% 31.351us 0.73% 31.351us 10.450us 0.000us 0.00% 0.000us 0.000us 3
3959
- cudaDeviceSynchronize 58.90% 2.519ms 58.90% 2.519ms 2.519ms 0.000us 0.00% 0.000us 0.000us 1
3960
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3961
- Self CPU time total: 4.277ms
3962
- Self CUDA time total: 2.747ms
3963
 
3964
 
3965
 
@@ -3969,19 +3969,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
3969
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3970
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3971
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3972
- hf_kernels_flash_attn3 2.33% 101.653us 39.53% 1.727ms 1.727ms 0.000us 0.00% 3.829ms 3.829ms 1
3973
- FlashAttnFunc 2.05% 89.593us 37.20% 1.625ms 541.619us 0.000us 0.00% 3.829ms 1.276ms 3
3974
- _flash_attn3_48fe103_dirty::fwd 1.17% 51.051us 35.15% 1.535ms 511.754us 2.856ms 100.00% 3.829ms 1.276ms 3
3975
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.858ms 100.06% 2.858ms 2.858ms 1
3976
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.856ms 100.00% 2.856ms 952.136us 3
3977
- Activity Buffer Request 32.54% 1.421ms 32.54% 1.421ms 1.421ms 972.574us 34.05% 972.574us 972.574us 1
3978
- aten::empty 0.62% 27.231us 0.62% 27.231us 4.538us 0.000us 0.00% 0.000us 0.000us 6
3979
- cudaFuncSetAttribute 0.12% 5.411us 0.12% 5.411us 1.804us 0.000us 0.00% 0.000us 0.000us 3
3980
- cudaLaunchKernel 0.69% 30.341us 0.69% 30.341us 10.114us 0.000us 0.00% 0.000us 0.000us 3
3981
- cudaDeviceSynchronize 60.47% 2.642ms 60.47% 2.642ms 2.642ms 0.000us 0.00% 0.000us 0.000us 1
3982
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3983
- Self CPU time total: 4.368ms
3984
- Self CUDA time total: 2.856ms
3985
 
3986
 
3987
 
@@ -3991,19 +3991,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
- hf_kernels_flash_attn3 2.61% 122.474us 42.62% 2.001ms 2.001ms 0.000us 0.00% 3.906ms 3.906ms 1
3995
- FlashAttnFunc 1.99% 93.683us 40.01% 1.879ms 626.332us 0.000us 0.00% 3.906ms 1.302ms 3
3996
- _flash_attn3_48fe103_dirty::fwd 1.17% 54.872us 38.02% 1.785ms 595.104us 2.915ms 100.00% 3.906ms 1.302ms 3
3997
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.917ms 100.05% 2.917ms 2.917ms 1
3998
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.915ms 100.00% 2.915ms 971.727us 3
3999
- Activity Buffer Request 31.11% 1.461ms 31.11% 1.461ms 1.461ms 991.129us 34.00% 991.129us 991.129us 1
4000
- aten::empty 0.59% 27.622us 0.59% 27.622us 4.604us 0.000us 0.00% 0.000us 0.000us 6
4001
- cudaFuncSetAttribute 0.12% 5.820us 0.12% 5.820us 1.940us 0.000us 0.00% 0.000us 0.000us 3
4002
- cudaLaunchKernel 5.03% 236.178us 5.03% 236.178us 78.726us 0.000us 0.00% 0.000us 0.000us 3
4003
- cudaDeviceSynchronize 57.38% 2.695ms 57.38% 2.695ms 2.695ms 0.000us 0.00% 0.000us 0.000us 1
4004
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
- Self CPU time total: 4.696ms
4006
- Self CUDA time total: 2.915ms
4007
 
4008
 
4009
 
@@ -4013,19 +4013,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
4013
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4014
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
- hf_kernels_flash_attn3 2.45% 124.235us 37.18% 1.882ms 1.882ms 0.000us 0.00% 4.537ms 4.537ms 1
4017
- FlashAttnFunc 1.83% 92.522us 34.73% 1.758ms 585.897us 0.000us 0.00% 4.537ms 1.512ms 3
4018
- _flash_attn3_48fe103_dirty::fwd 1.03% 52.313us 32.90% 1.665ms 555.056us 3.398ms 100.00% 4.537ms 1.512ms 3
4019
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.399ms 100.05% 3.399ms 3.399ms 1
4020
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.398ms 100.00% 3.398ms 1.133ms 3
4021
- Activity Buffer Request 27.82% 1.408ms 27.82% 1.408ms 1.408ms 1.139ms 33.52% 1.139ms 1.139ms 1
4022
- aten::empty 0.54% 27.441us 0.54% 27.441us 4.573us 0.000us 0.00% 0.000us 0.000us 6
4023
- cudaFuncSetAttribute 0.12% 5.839us 0.12% 5.839us 1.946us 0.000us 0.00% 0.000us 0.000us 3
4024
- cudaLaunchKernel 3.39% 171.646us 3.39% 171.646us 57.215us 0.000us 0.00% 0.000us 0.000us 3
4025
- cudaDeviceSynchronize 62.82% 3.179ms 62.82% 3.179ms 3.179ms 0.000us 0.00% 0.000us 0.000us 1
4026
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
- Self CPU time total: 5.061ms
4028
- Self CUDA time total: 3.398ms
4029
 
4030
 
4031
 
@@ -4035,33 +4035,33 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
- hf_kernels_flash_attn3 2.74% 138.223us 36.95% 1.864ms 1.864ms 0.000us 0.00% 4.557ms 4.557ms 1
4039
- FlashAttnFunc 1.84% 92.725us 34.21% 1.726ms 575.197us 0.000us 0.00% 4.557ms 1.519ms 3
4040
- _flash_attn3_48fe103_dirty::fwd 1.03% 52.171us 32.37% 1.633ms 544.289us 3.424ms 100.00% 4.557ms 1.519ms 3
4041
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.425ms 100.04% 3.425ms 3.425ms 1
4042
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.424ms 100.00% 3.424ms 1.141ms 3
4043
- Activity Buffer Request 27.34% 1.379ms 27.34% 1.379ms 1.379ms 1.133ms 33.10% 1.133ms 1.133ms 1
4044
- aten::empty 0.57% 28.661us 0.57% 28.661us 4.777us 0.000us 0.00% 0.000us 0.000us 6
4045
- cudaFuncSetAttribute 0.10% 5.240us 0.10% 5.240us 1.747us 0.000us 0.00% 0.000us 0.000us 3
4046
- cudaLaunchKernel 3.33% 167.776us 3.33% 167.776us 55.925us 0.000us 0.00% 0.000us 0.000us 3
4047
- cudaDeviceSynchronize 63.05% 3.181ms 63.05% 3.181ms 3.181ms 0.000us 0.00% 0.000us 0.000us 1
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
- Self CPU time total: 5.045ms
4050
- Self CUDA time total: 3.424ms
4051
 
4052
 
4053
  impl wl p50(ms) ok
4054
- hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
4055
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True
4056
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.02 True
4057
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
4058
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
4059
  hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4060
  </pre></div>
4061
  <div class="cell-stderr">
4062
  Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4063
- Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.27it/s]
4064
- Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.55it/s]
4065
  </div>
4066
  <div class="cell-artifacts">
4067
  <h4>Artifacts:</h4>
 
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 5.52s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3925
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3926
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3927
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3928
+ hf_kernels_flash_attn3 3.72% 161.222us 44.67% 1.935ms 1.935ms 0.000us 0.00% 3.599ms 3.599ms 1
3929
+ FlashAttnFunc 2.81% 121.834us 40.95% 1.774ms 591.218us 0.000us 0.00% 3.599ms 1.200ms 3
3930
+ _flash_attn3_48fe103_dirty::fwd 1.85% 79.992us 38.14% 1.652ms 550.607us 2.693ms 100.00% 3.599ms 1.200ms 3
3931
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.695ms 100.05% 2.695ms 2.695ms 1
3932
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.693ms 100.00% 2.693ms 897.759us 3
3933
+ Activity Buffer Request 33.93% 1.470ms 33.93% 1.470ms 1.470ms 905.439us 33.62% 905.439us 905.439us 1
3934
+ aten::empty 1.00% 43.311us 1.00% 43.311us 7.219us 0.000us 0.00% 0.000us 0.000us 6
3935
+ cudaFuncSetAttribute 0.32% 13.891us 0.32% 13.891us 4.630us 0.000us 0.00% 0.000us 0.000us 3
3936
+ cudaLaunchKernel 1.04% 45.121us 1.04% 45.121us 15.040us 0.000us 0.00% 0.000us 0.000us 3
3937
+ cudaDeviceSynchronize 55.33% 2.396ms 55.33% 2.396ms 2.396ms 0.000us 0.00% 0.000us 0.000us 1
3938
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3939
+ Self CPU time total: 4.331ms
3940
+ Self CUDA time total: 2.693ms
3941
 
3942
 
3943
 
 
3947
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3948
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3949
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3950
+ hf_kernels_flash_attn3 2.17% 96.772us 39.76% 1.770ms 1.770ms 0.000us 0.00% 3.876ms 3.876ms 1
3951
+ FlashAttnFunc 2.04% 90.694us 37.59% 1.674ms 557.834us 0.000us 0.00% 3.876ms 1.292ms 3
3952
+ _flash_attn3_48fe103_dirty::fwd 1.15% 51.142us 35.55% 1.583ms 527.603us 2.896ms 100.00% 3.876ms 1.292ms 3
3953
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.898ms 100.05% 2.898ms 2.898ms 1
3954
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.896ms 100.00% 2.896ms 965.387us 3
3955
+ Activity Buffer Request 33.04% 1.471ms 33.04% 1.471ms 1.471ms 979.809us 33.83% 979.809us 979.809us 1
3956
+ aten::empty 0.58% 25.610us 0.58% 25.610us 4.268us 0.000us 0.00% 0.000us 0.000us 6
3957
+ cudaFuncSetAttribute 0.12% 5.240us 0.12% 5.240us 1.747us 0.000us 0.00% 0.000us 0.000us 3
3958
+ cudaLaunchKernel 0.67% 29.750us 0.67% 29.750us 9.917us 0.000us 0.00% 0.000us 0.000us 3
3959
+ cudaDeviceSynchronize 60.24% 2.682ms 60.24% 2.682ms 2.682ms 0.000us 0.00% 0.000us 0.000us 1
3960
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3961
+ Self CPU time total: 4.452ms
3962
+ Self CUDA time total: 2.896ms
3963
 
3964
 
3965
 
 
3969
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3970
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3971
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3972
+ hf_kernels_flash_attn3 2.19% 98.331us 39.82% 1.786ms 1.786ms 0.000us 0.00% 3.885ms 3.885ms 1
3973
+ FlashAttnFunc 1.99% 89.333us 37.63% 1.688ms 562.551us 0.000us 0.00% 3.885ms 1.295ms 3
3974
+ _flash_attn3_48fe103_dirty::fwd 1.08% 48.311us 35.64% 1.598ms 532.773us 2.912ms 100.00% 3.885ms 1.295ms 3
3975
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.914ms 100.05% 2.914ms 2.914ms 1
3976
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.912ms 100.00% 2.912ms 970.802us 3
3977
+ Activity Buffer Request 33.18% 1.488ms 33.18% 1.488ms 1.488ms 972.637us 33.40% 972.637us 972.637us 1
3978
+ aten::empty 0.57% 25.370us 0.57% 25.370us 4.228us 0.000us 0.00% 0.000us 0.000us 6
3979
+ cudaFuncSetAttribute 0.13% 5.730us 0.13% 5.730us 1.910us 0.000us 0.00% 0.000us 0.000us 3
3980
+ cudaLaunchKernel 0.69% 30.861us 0.69% 30.861us 10.287us 0.000us 0.00% 0.000us 0.000us 3
3981
+ cudaDeviceSynchronize 60.18% 2.699ms 60.18% 2.699ms 2.699ms 0.000us 0.00% 0.000us 0.000us 1
3982
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3983
+ Self CPU time total: 4.485ms
3984
+ Self CUDA time total: 2.912ms
3985
 
3986
 
3987
 
 
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
+ hf_kernels_flash_attn3 2.51% 118.553us 41.81% 1.973ms 1.973ms 0.000us 0.00% 3.964ms 3.964ms 1
3995
+ FlashAttnFunc 1.94% 91.662us 39.30% 1.855ms 618.205us 0.000us 0.00% 3.964ms 1.321ms 3
3996
+ _flash_attn3_48fe103_dirty::fwd 1.07% 50.373us 37.36% 1.763ms 587.651us 2.962ms 100.00% 3.964ms 1.321ms 3
3997
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.964ms 100.05% 2.964ms 2.964ms 1
3998
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.962ms 100.00% 2.962ms 987.401us 3
3999
+ Activity Buffer Request 30.92% 1.459ms 30.92% 1.459ms 1.459ms 1.002ms 33.82% 1.002ms 1.002ms 1
4000
+ aten::empty 0.56% 26.451us 0.56% 26.451us 4.408us 0.000us 0.00% 0.000us 0.000us 6
4001
+ cudaFuncSetAttribute 0.11% 5.270us 0.11% 5.270us 1.757us 0.000us 0.00% 0.000us 0.000us 3
4002
+ cudaLaunchKernel 4.70% 221.845us 4.70% 221.845us 73.948us 0.000us 0.00% 0.000us 0.000us 3
4003
+ cudaDeviceSynchronize 58.19% 2.746ms 58.19% 2.746ms 2.746ms 0.000us 0.00% 0.000us 0.000us 1
4004
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
+ Self CPU time total: 4.719ms
4006
+ Self CUDA time total: 2.962ms
4007
 
4008
 
4009
 
 
4013
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4014
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
+ hf_kernels_flash_attn3 2.19% 114.453us 37.34% 1.953ms 1.953ms 0.000us 0.00% 4.662ms 4.662ms 1
4017
+ FlashAttnFunc 1.73% 90.401us 35.15% 1.838ms 612.822us 0.000us 0.00% 4.662ms 1.554ms 3
4018
+ _flash_attn3_48fe103_dirty::fwd 0.97% 50.643us 33.42% 1.748ms 582.688us 3.490ms 100.00% 4.662ms 1.554ms 3
4019
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.492ms 100.04% 3.492ms 3.492ms 1
4020
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.490ms 100.00% 3.490ms 1.163ms 3
4021
+ Activity Buffer Request 28.44% 1.487ms 28.44% 1.487ms 1.487ms 1.171ms 33.56% 1.171ms 1.171ms 1
4022
+ aten::empty 0.52% 27.271us 0.52% 27.271us 4.545us 0.000us 0.00% 0.000us 0.000us 6
4023
+ cudaFuncSetAttribute 0.09% 4.950us 0.09% 4.950us 1.650us 0.000us 0.00% 0.000us 0.000us 3
4024
+ cudaLaunchKernel 3.40% 178.024us 3.40% 178.024us 59.341us 0.000us 0.00% 0.000us 0.000us 3
4025
+ cudaDeviceSynchronize 62.66% 3.277ms 62.66% 3.277ms 3.277ms 0.000us 0.00% 0.000us 0.000us 1
4026
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
+ Self CPU time total: 5.230ms
4028
+ Self CUDA time total: 3.490ms
4029
 
4030
 
4031
 
 
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
+ hf_kernels_flash_attn3 2.26% 115.663us 36.27% 1.854ms 1.854ms 0.000us 0.00% 4.679ms 4.679ms 1
4039
+ FlashAttnFunc 2.25% 114.773us 34.01% 1.738ms 579.364us 0.000us 0.00% 4.679ms 1.560ms 3
4040
+ _flash_attn3_48fe103_dirty::fwd 1.02% 51.933us 31.76% 1.623ms 541.107us 3.499ms 100.00% 4.679ms 1.560ms 3
4041
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.500ms 100.04% 3.500ms 3.500ms 1
4042
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.499ms 100.00% 3.499ms 1.166ms 3
4043
+ Activity Buffer Request 26.80% 1.370ms 26.80% 1.370ms 1.370ms 1.181ms 33.75% 1.181ms 1.181ms 1
4044
+ aten::empty 0.54% 27.681us 0.54% 27.681us 4.613us 0.000us 0.00% 0.000us 0.000us 6
4045
+ cudaFuncSetAttribute 0.10% 5.079us 0.10% 5.079us 1.693us 0.000us 0.00% 0.000us 0.000us 3
4046
+ cudaLaunchKernel 3.30% 168.813us 3.30% 168.813us 56.271us 0.000us 0.00% 0.000us 0.000us 3
4047
+ cudaDeviceSynchronize 63.73% 3.257ms 63.73% 3.257ms 3.257ms 0.000us 0.00% 0.000us 0.000us 1
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
+ Self CPU time total: 5.111ms
4050
+ Self CUDA time total: 3.499ms
4051
 
4052
 
4053
  impl wl p50(ms) ok
4054
+ hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.94 True
4055
+ hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
4056
+ hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.04 True
4057
+ hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.05 True
4058
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
4059
  hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4060
  </pre></div>
4061
  <div class="cell-stderr">
4062
  Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4063
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.38it/s]
4064
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.75it/s]
4065
  </div>
4066
  <div class="cell-artifacts">
4067
  <h4>Artifacts:</h4>
flash_attn/impls/mem_efficient_attention.html CHANGED
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 32.68s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3924,28 +3924,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
3924
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3925
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3926
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3927
- torch_mem_eff 4.77% 340.490us 32.91% 2.350ms 2.350ms 0.000us 0.00% 5.530ms 5.530ms 1
3928
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.523ms 100.81% 5.523ms 5.523ms 1
3929
- aten::scaled_dot_product_attention 0.44% 31.421us 2.67% 190.938us 63.646us 0.000us 0.00% 4.861ms 1.620ms 3
3930
- aten::_scaled_dot_product_efficient_attention 0.35% 24.771us 2.23% 159.517us 53.172us 0.000us 0.00% 4.861ms 1.620ms 3
3931
- aten::_efficient_attention_forward 0.51% 36.163us 1.50% 107.413us 35.804us 4.861ms 88.73% 4.861ms 1.620ms 3
3932
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.861ms 88.73% 4.861ms 1.620ms 3
3933
- aten::contiguous 0.17% 12.232us 24.52% 1.751ms 194.525us 0.000us 0.00% 668.128us 74.236us 9
3934
- aten::clone 0.48% 34.579us 24.35% 1.738ms 193.165us 0.000us 0.00% 668.128us 74.236us 9
3935
- aten::copy_ 1.16% 82.494us 22.79% 1.628ms 180.845us 617.312us 11.27% 668.128us 74.236us 9
3936
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 617.312us 11.27% 617.312us 68.590us 9
3937
- Activity Buffer Request 20.35% 1.453ms 20.35% 1.453ms 1.453ms 50.816us 0.93% 50.816us 50.816us 1
3938
- aten::transpose 1.00% 71.754us 1.33% 95.065us 3.961us 0.000us 0.00% 0.000us 0.000us 24
3939
- aten::as_strided 0.33% 23.311us 0.33% 23.311us 0.971us 0.000us 0.00% 0.000us 0.000us 24
3940
- aten::empty_like 0.27% 19.481us 1.07% 76.301us 8.478us 0.000us 0.00% 0.000us 0.000us 9
3941
- aten::empty 1.26% 89.759us 1.26% 89.759us 4.274us 0.000us 0.00% 0.000us 0.000us 21
3942
- cudaLaunchKernel 1.62% 115.656us 1.62% 115.656us 9.638us 0.000us 0.00% 0.000us 0.000us 12
3943
- cudaStreamIsCapturing 0.04% 2.980us 0.04% 2.980us 0.993us 0.000us 0.00% 0.000us 0.000us 3
3944
- cudaFuncSetAttribute 0.16% 11.490us 0.16% 11.490us 3.830us 0.000us 0.00% 0.000us 0.000us 3
3945
- cudaDeviceSynchronize 67.09% 4.790ms 67.09% 4.790ms 4.790ms 0.000us 0.00% 0.000us 0.000us 1
3946
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3947
- Self CPU time total: 7.140ms
3948
- Self CUDA time total: 5.479ms
3949
 
3950
 
3951
 
@@ -3955,28 +3955,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3957
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3958
- torch_mem_eff 3.38% 251.986us 27.98% 2.086ms 2.086ms 0.000us 0.00% 6.014ms 6.014ms 1
3959
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.969ms 100.15% 5.969ms 5.969ms 1
3960
- aten::scaled_dot_product_attention 0.27% 19.962us 1.97% 146.646us 48.882us 0.000us 0.00% 5.323ms 1.774ms 3
3961
- aten::_scaled_dot_product_efficient_attention 0.26% 19.141us 1.70% 126.684us 42.228us 0.000us 0.00% 5.323ms 1.774ms 3
3962
- aten::_efficient_attention_forward 0.39% 29.281us 1.12% 83.514us 27.838us 5.323ms 89.32% 5.323ms 1.774ms 3
3963
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.323ms 89.32% 5.323ms 1.774ms 3
3964
- aten::contiguous 0.10% 7.510us 22.05% 1.644ms 182.655us 0.000us 0.00% 690.909us 76.768us 9
3965
- aten::clone 0.31% 23.251us 21.95% 1.636ms 181.821us 0.000us 0.00% 690.909us 76.768us 9
3966
- aten::copy_ 0.91% 68.131us 20.95% 1.562ms 173.540us 636.478us 10.68% 690.909us 76.768us 9
3967
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.478us 10.68% 636.478us 70.720us 9
3968
- Activity Buffer Request 19.09% 1.423ms 19.09% 1.423ms 1.423ms 54.431us 0.91% 54.431us 54.431us 1
3969
- aten::transpose 0.68% 50.542us 0.90% 67.292us 2.804us 0.000us 0.00% 0.000us 0.000us 24
3970
- aten::as_strided 0.22% 16.750us 0.22% 16.750us 0.698us 0.000us 0.00% 0.000us 0.000us 24
3971
- aten::empty_like 0.17% 12.371us 0.69% 51.272us 5.697us 0.000us 0.00% 0.000us 0.000us 9
3972
- aten::empty 0.87% 64.771us 0.87% 64.771us 3.084us 0.000us 0.00% 0.000us 0.000us 21
3973
- cudaLaunchKernel 1.25% 93.466us 1.25% 93.466us 7.789us 0.000us 0.00% 0.000us 0.000us 12
3974
- cudaStreamIsCapturing 0.03% 2.400us 0.03% 2.400us 0.800us 0.000us 0.00% 0.000us 0.000us 3
3975
- cudaFuncSetAttribute 0.05% 3.371us 0.05% 3.371us 1.124us 0.000us 0.00% 0.000us 0.000us 3
3976
- cudaDeviceSynchronize 72.02% 5.368ms 72.02% 5.368ms 5.368ms 0.000us 0.00% 0.000us 0.000us 1
3977
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3978
- Self CPU time total: 7.454ms
3979
- Self CUDA time total: 5.959ms
3980
 
3981
 
3982
 
@@ -3986,28 +3986,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
- torch_mem_eff 3.08% 235.490us 27.25% 2.083ms 2.083ms 0.000us 0.00% 6.182ms 6.182ms 1
3990
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.132ms 100.15% 6.132ms 6.132ms 1
3991
- aten::scaled_dot_product_attention 0.24% 18.220us 1.86% 142.046us 47.349us 0.000us 0.00% 5.466ms 1.822ms 3
3992
- aten::_scaled_dot_product_efficient_attention 0.24% 18.131us 1.62% 123.826us 41.275us 0.000us 0.00% 5.466ms 1.822ms 3
3993
- aten::_efficient_attention_forward 0.37% 27.940us 1.08% 82.291us 27.430us 5.466ms 89.28% 5.466ms 1.822ms 3
3994
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.466ms 89.28% 5.466ms 1.822ms 3
3995
- aten::contiguous 0.10% 7.272us 21.47% 1.642ms 182.409us 0.000us 0.00% 715.197us 79.466us 9
3996
- aten::clone 0.29% 22.290us 21.38% 1.634ms 181.601us 0.000us 0.00% 715.197us 79.466us 9
3997
- aten::copy_ 0.83% 63.251us 20.39% 1.559ms 173.182us 656.318us 10.72% 715.197us 79.466us 9
3998
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 656.318us 10.72% 656.318us 72.924us 9
3999
- Activity Buffer Request 18.70% 1.430ms 18.70% 1.430ms 1.430ms 58.879us 0.96% 58.879us 58.879us 1
4000
- aten::transpose 0.93% 71.209us 1.15% 87.625us 3.651us 0.000us 0.00% 0.000us 0.000us 24
4001
- aten::as_strided 0.21% 16.416us 0.21% 16.416us 0.684us 0.000us 0.00% 0.000us 0.000us 24
4002
- aten::empty_like 0.15% 11.741us 0.70% 53.481us 5.942us 0.000us 0.00% 0.000us 0.000us 9
4003
- aten::empty 0.89% 67.840us 0.89% 67.840us 3.230us 0.000us 0.00% 0.000us 0.000us 21
4004
- cudaLaunchKernel 1.15% 88.022us 1.15% 88.022us 7.335us 0.000us 0.00% 0.000us 0.000us 12
4005
- cudaStreamIsCapturing 0.03% 2.651us 0.03% 2.651us 0.884us 0.000us 0.00% 0.000us 0.000us 3
4006
- cudaFuncSetAttribute 0.04% 3.370us 0.04% 3.370us 1.123us 0.000us 0.00% 0.000us 0.000us 3
4007
- cudaDeviceSynchronize 72.75% 5.562ms 72.75% 5.562ms 5.562ms 0.000us 0.00% 0.000us 0.000us 1
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
- Self CPU time total: 7.646ms
4010
- Self CUDA time total: 6.123ms
4011
 
4012
 
4013
 
@@ -4017,28 +4017,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
- torch_mem_eff 2.84% 224.838us 29.78% 2.354ms 2.354ms 0.000us 0.00% 6.170ms 6.170ms 1
4021
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.121ms 100.15% 6.121ms 6.121ms 1
4022
- aten::scaled_dot_product_attention 0.24% 18.891us 1.82% 143.646us 47.882us 0.000us 0.00% 5.458ms 1.819ms 3
4023
- aten::_scaled_dot_product_efficient_attention 0.24% 19.093us 1.58% 124.755us 41.585us 0.000us 0.00% 5.458ms 1.819ms 3
4024
- aten::_efficient_attention_forward 0.36% 28.140us 1.04% 82.213us 27.404us 5.458ms 89.30% 5.458ms 1.819ms 3
4025
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.458ms 89.30% 5.458ms 1.819ms 3
4026
- aten::contiguous 0.10% 7.739us 24.57% 1.942ms 215.806us 0.000us 0.00% 711.998us 79.111us 9
4027
- aten::clone 0.31% 24.450us 24.47% 1.935ms 214.946us 0.000us 0.00% 711.998us 79.111us 9
4028
- aten::copy_ 0.86% 68.064us 23.51% 1.859ms 206.523us 653.982us 10.70% 711.998us 79.111us 9
4029
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 653.982us 10.70% 653.982us 72.665us 9
4030
- Activity Buffer Request 18.84% 1.489ms 18.84% 1.489ms 1.489ms 58.016us 0.95% 58.016us 58.016us 1
4031
- aten::transpose 0.62% 49.288us 0.84% 66.489us 2.770us 0.000us 0.00% 0.000us 0.000us 24
4032
- aten::as_strided 0.22% 17.201us 0.22% 17.201us 0.717us 0.000us 0.00% 0.000us 0.000us 24
4033
- aten::empty_like 0.15% 12.041us 0.65% 51.362us 5.707us 0.000us 0.00% 0.000us 0.000us 9
4034
- aten::empty 0.83% 65.351us 0.83% 65.351us 3.112us 0.000us 0.00% 0.000us 0.000us 21
4035
- cudaLaunchKernel 4.09% 323.234us 4.09% 323.234us 26.936us 0.000us 0.00% 0.000us 0.000us 12
4036
- cudaStreamIsCapturing 0.03% 2.670us 0.03% 2.670us 0.890us 0.000us 0.00% 0.000us 0.000us 3
4037
- cudaFuncSetAttribute 0.04% 3.430us 0.04% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3
4038
- cudaDeviceSynchronize 70.22% 5.551ms 70.22% 5.551ms 5.551ms 0.000us 0.00% 0.000us 0.000us 1
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
- Self CPU time total: 7.905ms
4041
- Self CUDA time total: 6.112ms
4042
 
4043
 
4044
 
@@ -4048,28 +4048,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4050
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4051
- torch_mem_eff 2.78% 220.799us 28.42% 2.258ms 2.258ms 0.000us 0.00% 6.296ms 6.296ms 1
4052
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.245ms 100.15% 6.245ms 6.245ms 1
4053
- aten::scaled_dot_product_attention 0.24% 19.311us 1.79% 142.116us 47.372us 0.000us 0.00% 5.574ms 1.858ms 3
4054
- aten::_scaled_dot_product_efficient_attention 0.23% 17.909us 1.55% 122.805us 40.935us 0.000us 0.00% 5.574ms 1.858ms 3
4055
- aten::_efficient_attention_forward 0.36% 28.682us 1.03% 82.073us 27.358us 5.574ms 89.39% 5.574ms 1.858ms 3
4056
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.574ms 89.39% 5.574ms 1.858ms 3
4057
- aten::contiguous 0.09% 7.009us 23.32% 1.852ms 205.811us 0.000us 0.00% 721.599us 80.178us 9
4058
- aten::clone 0.28% 22.450us 23.23% 1.845ms 205.033us 0.000us 0.00% 721.599us 80.178us 9
4059
- aten::copy_ 0.87% 68.713us 22.33% 1.774ms 197.096us 661.695us 10.61% 721.599us 80.178us 9
4060
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 661.695us 10.61% 661.695us 73.522us 9
4061
- Activity Buffer Request 17.91% 1.422ms 17.91% 1.422ms 1.422ms 59.904us 0.96% 59.904us 59.904us 1
4062
- aten::transpose 0.61% 48.435us 0.82% 65.304us 2.721us 0.000us 0.00% 0.000us 0.000us 24
4063
- aten::as_strided 0.21% 16.869us 0.21% 16.869us 0.703us 0.000us 0.00% 0.000us 0.000us 24
4064
- aten::empty_like 0.14% 11.511us 0.62% 48.982us 5.442us 0.000us 0.00% 0.000us 0.000us 9
4065
- aten::empty 0.78% 61.691us 0.78% 61.691us 2.938us 0.000us 0.00% 0.000us 0.000us 21
4066
- cudaLaunchKernel 3.85% 305.580us 3.85% 305.580us 25.465us 0.000us 0.00% 0.000us 0.000us 12
4067
- cudaStreamIsCapturing 0.03% 2.440us 0.03% 2.440us 0.813us 0.000us 0.00% 0.000us 0.000us 3
4068
- cudaFuncSetAttribute 0.05% 3.920us 0.05% 3.920us 1.307us 0.000us 0.00% 0.000us 0.000us 3
4069
- cudaDeviceSynchronize 71.58% 5.685ms 71.58% 5.685ms 5.685ms 0.000us 0.00% 0.000us 0.000us 1
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
- Self CPU time total: 7.943ms
4072
- Self CUDA time total: 6.236ms
4073
 
4074
 
4075
 
@@ -4079,90 +4079,38 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4081
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4082
- torch_mem_eff 3.27% 267.711us 29.30% 2.401ms 2.401ms 0.000us 0.00% 6.459ms 6.459ms 1
4083
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.406ms 100.13% 6.406ms 6.406ms 1
4084
- aten::scaled_dot_product_attention 0.24% 19.643us 1.85% 151.176us 50.392us 0.000us 0.00% 5.726ms 1.909ms 3
4085
- aten::_scaled_dot_product_efficient_attention 0.26% 20.920us 1.61% 131.533us 43.844us 0.000us 0.00% 5.726ms 1.909ms 3
4086
- aten::_efficient_attention_forward 0.37% 30.563us 1.03% 84.603us 28.201us 5.726ms 89.50% 5.726ms 1.909ms 3
4087
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.726ms 89.50% 5.726ms 1.909ms 3
4088
- aten::contiguous 0.09% 7.670us 23.58% 1.932ms 214.647us 0.000us 0.00% 733.247us 81.472us 9
4089
- aten::clone 0.31% 25.042us 23.48% 1.924ms 213.795us 0.000us 0.00% 733.247us 81.472us 9
4090
- aten::copy_ 0.88% 72.162us 22.52% 1.845ms 205.052us 671.711us 10.50% 733.247us 81.472us 9
4091
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 671.711us 10.50% 671.711us 74.635us 9
4092
- Activity Buffer Request 17.78% 1.456ms 17.78% 1.456ms 1.456ms 61.536us 0.96% 61.536us 61.536us 1
4093
- aten::transpose 0.71% 58.110us 0.93% 75.842us 3.160us 0.000us 0.00% 0.000us 0.000us 24
4094
- aten::as_strided 0.22% 17.732us 0.22% 17.732us 0.739us 0.000us 0.00% 0.000us 0.000us 24
4095
- aten::empty_like 0.15% 12.319us 0.65% 53.641us 5.960us 0.000us 0.00% 0.000us 0.000us 9
4096
- aten::empty 0.81% 66.513us 0.81% 66.513us 3.167us 0.000us 0.00% 0.000us 0.000us 21
4097
- cudaLaunchKernel 4.14% 339.159us 4.14% 339.159us 28.263us 0.000us 0.00% 0.000us 0.000us 12
4098
- cudaStreamIsCapturing 0.03% 2.379us 0.03% 2.379us 0.793us 0.000us 0.00% 0.000us 0.000us 3
4099
- cudaFuncSetAttribute 0.05% 4.230us 0.05% 4.230us 1.410us 0.000us 0.00% 0.000us 0.000us 3
4100
- cudaDeviceSynchronize 70.70% 5.793ms 70.70% 5.793ms 5.793ms 0.000us 0.00% 0.000us 0.000us 1
4101
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4102
- Self CPU time total: 8.193ms
4103
- Self CUDA time total: 6.398ms
4104
 
4105
 
4106
  impl wl p50(ms) ok
4107
- torch_mem_eff cuda_attn_L128_bfloat16 1.86 True
4108
- torch_mem_eff cuda_attn_L256_bfloat16 1.97 True
4109
- torch_mem_eff cuda_attn_L320_bfloat16 2.04 True
4110
- torch_mem_eff cuda_attn_L384_bfloat16 2.06 True
4111
- torch_mem_eff cuda_attn_L448_bfloat16 2.03 True
4112
  torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
4113
  </pre></div>
4114
- <div class="uv-install-logs" id="uv-logs-benchmark">
4115
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4116
- <div class="uv-logs-content" style="display: none;">
4117
- Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4118
- Downloading networkx (1.9MiB)
4119
- Downloading matplotlib (8.3MiB)
4120
- Downloading nvidia-cufft-cu12 (184.2MiB)
4121
- Downloading sympy (6.0MiB)
4122
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4123
- Downloading nvidia-cublas-cu12 (566.8MiB)
4124
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4125
- Downloading numpy (16.2MiB)
4126
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4127
- Downloading setuptools (1.1MiB)
4128
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4129
- Downloading nvidia-curand-cu12 (60.7MiB)
4130
- Downloading nvidia-nccl-cu12 (307.4MiB)
4131
- Downloading kiwisolver (1.4MiB)
4132
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4133
- Downloading fonttools (4.7MiB)
4134
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4135
- Downloading pillow (6.7MiB)
4136
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4137
- Downloading nvidia-cufile-cu12 (1.1MiB)
4138
- Downloading triton (148.3MiB)
4139
- Downloading torch (846.9MiB)
4140
- Downloading nvidia-cufile-cu12
4141
- Downloading kiwisolver
4142
- Downloading setuptools
4143
- Downloading fonttools
4144
- Downloading networkx
4145
- Downloading pillow
4146
- Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4147
- Downloading nvidia-cuda-cupti-cu12
4148
- Downloading matplotlib
4149
- Downloading numpy
4150
- Downloading sympy
4151
- Downloading nvidia-nvjitlink-cu12
4152
- Downloading nvidia-curand-cu12
4153
- Downloading nvidia-cuda-nvrtc-cu12
4154
- Downloading triton
4155
- Downloading nvidia-cufft-cu12
4156
- Downloading nvidia-cusolver-cu12
4157
- Downloading nvidia-cusparse-cu12
4158
- Downloading nvidia-cusparselt-cu12
4159
- Downloading nvidia-nccl-cu12
4160
- Downloading nvidia-cublas-cu12
4161
- Downloading nvidia-cudnn-cu12
4162
- Downloading torch
4163
- Installed 37 packages in 216ms
4164
- </div>
4165
- </div>
4166
  <div class="cell-artifacts">
4167
  <h4>Artifacts:</h4>
4168
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 3.92s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3924
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3925
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3926
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3927
+ torch_mem_eff 4.77% 333.269us 32.71% 2.284ms 2.284ms 0.000us 0.00% 5.420ms 5.420ms 1
3928
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.402ms 100.61% 5.402ms 5.402ms 1
3929
+ aten::scaled_dot_product_attention 0.44% 30.450us 2.54% 177.435us 59.145us 0.000us 0.00% 4.753ms 1.584ms 3
3930
+ aten::_scaled_dot_product_efficient_attention 0.33% 22.722us 2.10% 146.985us 48.995us 0.000us 0.00% 4.753ms 1.584ms 3
3931
+ aten::_efficient_attention_forward 0.51% 35.382us 1.42% 99.273us 33.091us 4.753ms 88.51% 4.753ms 1.584ms 3
3932
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.753ms 88.51% 4.753ms 1.584ms 3
3933
+ aten::contiguous 0.17% 11.660us 24.51% 1.712ms 190.185us 0.000us 0.00% 667.266us 74.141us 9
3934
+ aten::clone 0.46% 31.810us 24.34% 1.700ms 188.889us 0.000us 0.00% 667.266us 74.141us 9
3935
+ aten::copy_ 1.01% 70.871us 22.86% 1.597ms 177.404us 616.738us 11.49% 667.266us 74.141us 9
3936
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 616.738us 11.49% 616.738us 68.526us 9
3937
+ Activity Buffer Request 20.64% 1.441ms 20.64% 1.441ms 1.441ms 50.528us 0.94% 50.528us 50.528us 1
3938
+ aten::transpose 0.91% 63.619us 1.25% 87.011us 3.625us 0.000us 0.00% 0.000us 0.000us 24
3939
+ aten::as_strided 0.33% 23.392us 0.33% 23.392us 0.975us 0.000us 0.00% 0.000us 0.000us 24
3940
+ aten::empty_like 0.24% 16.972us 1.02% 71.553us 7.950us 0.000us 0.00% 0.000us 0.000us 9
3941
+ aten::empty 1.18% 82.691us 1.18% 82.691us 3.938us 0.000us 0.00% 0.000us 0.000us 21
3942
+ cudaLaunchKernel 1.55% 108.383us 1.55% 108.383us 9.032us 0.000us 0.00% 0.000us 0.000us 12
3943
+ cudaStreamIsCapturing 0.05% 3.260us 0.05% 3.260us 1.087us 0.000us 0.00% 0.000us 0.000us 3
3944
+ cudaFuncSetAttribute 0.12% 8.450us 0.12% 8.450us 2.817us 0.000us 0.00% 0.000us 0.000us 3
3945
+ cudaDeviceSynchronize 67.29% 4.700ms 67.29% 4.700ms 4.700ms 0.000us 0.00% 0.000us 0.000us 1
3946
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3947
+ Self CPU time total: 6.984ms
3948
+ Self CUDA time total: 5.369ms
3949
 
3950
 
3951
 
 
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3957
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3958
+ torch_mem_eff 3.53% 251.015us 29.52% 2.098ms 2.098ms 0.000us 0.00% 5.633ms 5.633ms 1
3959
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.587ms 100.15% 5.587ms 5.587ms 1
3960
+ aten::scaled_dot_product_attention 0.25% 17.630us 2.05% 145.594us 48.531us 0.000us 0.00% 4.943ms 1.648ms 3
3961
+ aten::_scaled_dot_product_efficient_attention 0.28% 19.810us 1.80% 127.964us 42.655us 0.000us 0.00% 4.943ms 1.648ms 3
3962
+ aten::_efficient_attention_forward 0.42% 29.862us 1.18% 83.512us 27.837us 4.943ms 88.61% 4.943ms 1.648ms 3
3963
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.943ms 88.61% 4.943ms 1.648ms 3
3964
+ aten::contiguous 0.10% 7.191us 23.30% 1.656ms 184.002us 0.000us 0.00% 689.540us 76.616us 9
3965
+ aten::clone 0.33% 23.318us 23.20% 1.649ms 183.203us 0.000us 0.00% 689.540us 76.616us 9
3966
+ aten::copy_ 0.92% 65.725us 22.12% 1.572ms 174.717us 635.140us 11.39% 689.540us 76.616us 9
3967
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 635.140us 11.39% 635.140us 70.571us 9
3968
+ Activity Buffer Request 20.24% 1.439ms 20.24% 1.439ms 1.439ms 54.400us 0.98% 54.400us 54.400us 1
3969
+ aten::transpose 0.71% 50.494us 0.99% 70.123us 2.922us 0.000us 0.00% 0.000us 0.000us 24
3970
+ aten::as_strided 0.28% 19.629us 0.28% 19.629us 0.818us 0.000us 0.00% 0.000us 0.000us 24
3971
+ aten::empty_like 0.18% 12.608us 0.75% 53.061us 5.896us 0.000us 0.00% 0.000us 0.000us 9
3972
+ aten::empty 0.94% 66.903us 0.94% 66.903us 3.186us 0.000us 0.00% 0.000us 0.000us 21
3973
+ cudaLaunchKernel 1.25% 89.012us 1.25% 89.012us 7.418us 0.000us 0.00% 0.000us 0.000us 12
3974
+ cudaStreamIsCapturing 0.03% 2.220us 0.03% 2.220us 0.740us 0.000us 0.00% 0.000us 0.000us 3
3975
+ cudaFuncSetAttribute 0.05% 3.880us 0.05% 3.880us 1.293us 0.000us 0.00% 0.000us 0.000us 3
3976
+ cudaDeviceSynchronize 70.48% 5.009ms 70.48% 5.009ms 5.009ms 0.000us 0.00% 0.000us 0.000us 1
3977
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3978
+ Self CPU time total: 7.107ms
3979
+ Self CUDA time total: 5.578ms
3980
 
3981
 
3982
 
 
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
+ torch_mem_eff 3.28% 246.598us 28.54% 2.146ms 2.146ms 0.000us 0.00% 6.014ms 6.014ms 1
3990
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.967ms 100.18% 5.967ms 5.967ms 1
3991
+ aten::scaled_dot_product_attention 0.24% 18.181us 1.92% 144.583us 48.194us 0.000us 0.00% 5.302ms 1.767ms 3
3992
+ aten::_scaled_dot_product_efficient_attention 0.27% 19.980us 1.68% 126.402us 42.134us 0.000us 0.00% 5.302ms 1.767ms 3
3993
+ aten::_efficient_attention_forward 0.38% 28.571us 1.10% 82.521us 27.507us 5.302ms 89.01% 5.302ms 1.767ms 3
3994
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.302ms 89.01% 5.302ms 1.767ms 3
3995
+ aten::contiguous 0.09% 6.930us 22.70% 1.707ms 189.666us 0.000us 0.00% 712.547us 79.172us 9
3996
+ aten::clone 0.30% 22.691us 22.61% 1.700ms 188.896us 0.000us 0.00% 712.547us 79.172us 9
3997
+ aten::copy_ 1.08% 81.024us 21.57% 1.622ms 180.228us 654.403us 10.99% 712.547us 79.172us 9
3998
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.403us 10.99% 654.403us 72.711us 9
3999
+ Activity Buffer Request 19.57% 1.471ms 19.57% 1.471ms 1.471ms 58.144us 0.98% 58.144us 58.144us 1
4000
+ aten::transpose 0.68% 51.431us 0.95% 71.351us 2.973us 0.000us 0.00% 0.000us 0.000us 24
4001
+ aten::as_strided 0.26% 19.920us 0.26% 19.920us 0.830us 0.000us 0.00% 0.000us 0.000us 24
4002
+ aten::empty_like 0.16% 11.979us 0.74% 55.320us 6.147us 0.000us 0.00% 0.000us 0.000us 9
4003
+ aten::empty 0.93% 69.561us 0.93% 69.561us 3.312us 0.000us 0.00% 0.000us 0.000us 21
4004
+ cudaLaunchKernel 1.22% 91.652us 1.22% 91.652us 7.638us 0.000us 0.00% 0.000us 0.000us 12
4005
+ cudaStreamIsCapturing 0.03% 2.359us 0.03% 2.359us 0.786us 0.000us 0.00% 0.000us 0.000us 3
4006
+ cudaFuncSetAttribute 0.05% 3.430us 0.05% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3
4007
+ cudaDeviceSynchronize 71.46% 5.373ms 71.46% 5.373ms 5.373ms 0.000us 0.00% 0.000us 0.000us 1
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
+ Self CPU time total: 7.519ms
4010
+ Self CUDA time total: 5.956ms
4011
 
4012
 
4013
 
 
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
+ torch_mem_eff 3.21% 251.576us 29.97% 2.347ms 2.347ms 0.000us 0.00% 6.116ms 6.116ms 1
4021
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.068ms 100.14% 6.068ms 6.068ms 1
4022
+ aten::scaled_dot_product_attention 0.24% 18.800us 1.87% 146.693us 48.898us 0.000us 0.00% 5.408ms 1.803ms 3
4023
+ aten::_scaled_dot_product_efficient_attention 0.25% 19.900us 1.63% 127.893us 42.631us 0.000us 0.00% 5.408ms 1.803ms 3
4024
+ aten::_efficient_attention_forward 0.38% 29.372us 1.07% 83.903us 27.968us 5.408ms 89.25% 5.408ms 1.803ms 3
4025
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.408ms 89.25% 5.408ms 1.803ms 3
4026
+ aten::contiguous 0.10% 7.511us 24.29% 1.902ms 211.340us 0.000us 0.00% 708.735us 78.748us 9
4027
+ aten::clone 0.28% 21.872us 24.19% 1.895ms 210.505us 0.000us 0.00% 708.735us 78.748us 9
4028
+ aten::copy_ 0.85% 66.540us 23.20% 1.817ms 201.834us 651.551us 10.75% 708.735us 78.748us 9
4029
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 651.551us 10.75% 651.551us 72.395us 9
4030
+ Activity Buffer Request 18.68% 1.462ms 18.68% 1.462ms 1.462ms 57.184us 0.94% 57.184us 57.184us 1
4031
+ aten::transpose 0.65% 50.781us 0.90% 70.402us 2.933us 0.000us 0.00% 0.000us 0.000us 24
4032
+ aten::as_strided 0.25% 19.621us 0.25% 19.621us 0.818us 0.000us 0.00% 0.000us 0.000us 24
4033
+ aten::empty_like 0.15% 11.809us 0.72% 56.170us 6.241us 0.000us 0.00% 0.000us 0.000us 9
4034
+ aten::empty 0.90% 70.242us 0.90% 70.242us 3.345us 0.000us 0.00% 0.000us 0.000us 21
4035
+ cudaLaunchKernel 3.97% 310.797us 3.97% 310.797us 25.900us 0.000us 0.00% 0.000us 0.000us 12
4036
+ cudaStreamIsCapturing 0.03% 2.250us 0.03% 2.250us 0.750us 0.000us 0.00% 0.000us 0.000us 3
4037
+ cudaFuncSetAttribute 0.04% 3.220us 0.04% 3.220us 1.073us 0.000us 0.00% 0.000us 0.000us 3
4038
+ cudaDeviceSynchronize 70.03% 5.484ms 70.03% 5.484ms 5.484ms 0.000us 0.00% 0.000us 0.000us 1
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
+ Self CPU time total: 7.830ms
4041
+ Self CUDA time total: 6.059ms
4042
 
4043
 
4044
 
 
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4050
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4051
+ torch_mem_eff 3.15% 250.575us 28.50% 2.270ms 2.270ms 0.000us 0.00% 6.322ms 6.322ms 1
4052
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.270ms 100.14% 6.270ms 6.270ms 1
4053
+ aten::scaled_dot_product_attention 0.22% 17.572us 1.82% 145.084us 48.361us 0.000us 0.00% 5.598ms 1.866ms 3
4054
+ aten::_scaled_dot_product_efficient_attention 0.24% 19.250us 1.60% 127.512us 42.504us 0.000us 0.00% 5.598ms 1.866ms 3
4055
+ aten::_efficient_attention_forward 0.36% 28.812us 1.05% 83.962us 27.987us 5.598ms 89.40% 5.598ms 1.866ms 3
4056
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.598ms 89.40% 5.598ms 1.866ms 3
4057
+ aten::contiguous 0.09% 6.912us 22.94% 1.827ms 203.045us 0.000us 0.00% 724.000us 80.444us 9
4058
+ aten::clone 0.28% 21.949us 22.86% 1.820ms 202.277us 0.000us 0.00% 724.000us 80.444us 9
4059
+ aten::copy_ 0.82% 65.091us 21.89% 1.744ms 193.745us 664.032us 10.60% 724.000us 80.444us 9
4060
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 664.032us 10.60% 664.032us 73.781us 9
4061
+ Activity Buffer Request 18.02% 1.435ms 18.02% 1.435ms 1.435ms 59.968us 0.96% 59.968us 59.968us 1
4062
+ aten::transpose 0.64% 50.930us 0.89% 70.859us 2.952us 0.000us 0.00% 0.000us 0.000us 24
4063
+ aten::as_strided 0.25% 19.929us 0.25% 19.929us 0.830us 0.000us 0.00% 0.000us 0.000us 24
4064
+ aten::empty_like 0.15% 12.022us 0.69% 54.843us 6.094us 0.000us 0.00% 0.000us 0.000us 9
4065
+ aten::empty 0.87% 69.430us 0.87% 69.430us 3.306us 0.000us 0.00% 0.000us 0.000us 21
4066
+ cudaLaunchKernel 3.34% 266.388us 3.34% 266.388us 22.199us 0.000us 0.00% 0.000us 0.000us 12
4067
+ cudaStreamIsCapturing 0.03% 2.320us 0.03% 2.320us 0.773us 0.000us 0.00% 0.000us 0.000us 3
4068
+ cudaFuncSetAttribute 0.04% 3.120us 0.04% 3.120us 1.040us 0.000us 0.00% 0.000us 0.000us 3
4069
+ cudaDeviceSynchronize 71.50% 5.695ms 71.50% 5.695ms 5.695ms 0.000us 0.00% 0.000us 0.000us 1
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
+ Self CPU time total: 7.965ms
4072
+ Self CUDA time total: 6.262ms
4073
 
4074
 
4075
 
 
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4081
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4082
+ torch_mem_eff 3.00% 248.403us 26.98% 2.232ms 2.232ms 0.000us 0.00% 6.668ms 6.668ms 1
4083
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.616ms 100.13% 6.616ms 6.616ms 1
4084
+ aten::scaled_dot_product_attention 0.21% 17.221us 1.72% 142.654us 47.551us 0.000us 0.00% 5.939ms 1.980ms 3
4085
+ aten::_scaled_dot_product_efficient_attention 0.23% 18.779us 1.52% 125.433us 41.811us 0.000us 0.00% 5.939ms 1.980ms 3
4086
+ aten::_efficient_attention_forward 0.34% 28.440us 0.99% 81.712us 27.237us 5.939ms 89.88% 5.939ms 1.980ms 3
4087
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.939ms 89.88% 5.939ms 1.980ms 3
4088
+ aten::contiguous 0.08% 6.861us 21.66% 1.792ms 199.142us 0.000us 0.00% 729.440us 81.049us 9
4089
+ aten::clone 0.26% 21.352us 21.58% 1.785ms 198.379us 0.000us 0.00% 729.440us 81.049us 9
4090
+ aten::copy_ 0.83% 69.012us 20.65% 1.709ms 189.858us 668.928us 10.12% 729.440us 81.049us 9
4091
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 668.928us 10.12% 668.928us 74.325us 9
4092
+ Activity Buffer Request 17.29% 1.430ms 17.29% 1.430ms 1.430ms 60.512us 0.92% 60.512us 60.512us 1
4093
+ aten::transpose 0.63% 51.780us 0.89% 73.784us 3.074us 0.000us 0.00% 0.000us 0.000us 24
4094
+ aten::as_strided 0.27% 22.004us 0.27% 22.004us 0.917us 0.000us 0.00% 0.000us 0.000us 24
4095
+ aten::empty_like 0.14% 11.870us 0.67% 55.340us 6.149us 0.000us 0.00% 0.000us 0.000us 9
4096
+ aten::empty 0.84% 69.312us 0.84% 69.312us 3.301us 0.000us 0.00% 0.000us 0.000us 21
4097
+ cudaLaunchKernel 2.79% 231.145us 2.79% 231.145us 19.262us 0.000us 0.00% 0.000us 0.000us 12
4098
+ cudaStreamIsCapturing 0.03% 2.280us 0.03% 2.280us 0.760us 0.000us 0.00% 0.000us 0.000us 3
4099
+ cudaFuncSetAttribute 0.04% 3.570us 0.04% 3.570us 1.190us 0.000us 0.00% 0.000us 0.000us 3
4100
+ cudaDeviceSynchronize 73.02% 6.041ms 73.02% 6.041ms 6.041ms 0.000us 0.00% 0.000us 0.000us 1
4101
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4102
+ Self CPU time total: 8.273ms
4103
+ Self CUDA time total: 6.608ms
4104
 
4105
 
4106
  impl wl p50(ms) ok
4107
+ torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
4108
+ torch_mem_eff cuda_attn_L256_bfloat16 1.89 True
4109
+ torch_mem_eff cuda_attn_L320_bfloat16 2.00 True
4110
+ torch_mem_eff cuda_attn_L384_bfloat16 1.97 True
4111
+ torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
4112
  torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
4113
  </pre></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4114
  <div class="cell-artifacts">
4115
  <h4>Artifacts:</h4>
4116
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/sage_attention.html CHANGED
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 4.22s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3920,23 +3920,28 @@ Cell: benchmark | 4.22s
3920
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
3921
  impl wl p50(ms) ok
3922
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
3923
- Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
3924
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
3925
- Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
3926
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
3927
- Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
3928
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
3929
- Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
3930
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
3931
- Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
3932
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
3933
- Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
3934
  </pre></div>
3935
- <div class="cell-stderr">
3936
- Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3937
- Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 13.92it/s]
3938
- Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 19.13it/s]
3939
  </div>
 
 
 
 
 
3940
  <div class="cell-artifacts">
3941
  <h4>Artifacts:</h4>
3942
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 4.53s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3920
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
3921
  impl wl p50(ms) ok
3922
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
3923
+ Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
3924
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
3925
+ Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
3926
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
3927
+ Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
3928
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
3929
+ Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
3930
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
3931
+ Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
3932
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
3933
+ Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
3934
  </pre></div>
3935
+ <div class="uv-install-logs" id="uv-logs-benchmark">
3936
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3937
+ <div class="uv-logs-content" style="display: none;">
3938
+ Installed 15 packages in 14ms
3939
  </div>
3940
+ </div>
3941
+ <div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3942
+ Fetching 11 files: 18%|█▊ | 2/11 [00:00&lt;00:00, 15.79it/s]
3943
+ Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 13.55it/s]
3944
+ Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 18.83it/s]</div>
3945
  <div class="cell-artifacts">
3946
  <h4>Artifacts:</h4>
3947
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/xformers.html CHANGED
@@ -3923,21 +3923,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
3923
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3924
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3925
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3926
- xformers_meff 9.93% 451.937us 49.71% 2.262ms 2.262ms 0.000us 0.00% 3.695ms 3.695ms 1
3927
- xformers_flash3::flash_fwd 4.26% 193.656us 38.96% 1.773ms 590.904us 0.000us 0.00% 3.695ms 1.232ms 3
3928
- flash_attn_3::fwd 1.62% 73.841us 34.71% 1.579ms 526.352us 2.795ms 100.00% 3.695ms 1.232ms 3
3929
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.797ms 100.05% 2.797ms 2.797ms 1
3930
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.00% 2.795ms 931.773us 3
3931
- Activity Buffer Request 31.17% 1.418ms 31.17% 1.418ms 1.418ms 899.421us 32.18% 899.421us 899.421us 1
3932
- aten::empty 0.76% 34.741us 0.76% 34.741us 5.790us 0.000us 0.00% 0.000us 0.000us 6
3933
- cudaFuncSetAttribute 0.30% 13.732us 0.30% 13.732us 4.577us 0.000us 0.00% 0.000us 0.000us 3
3934
- cudaLaunchKernel 0.85% 38.662us 0.85% 38.662us 12.887us 0.000us 0.00% 0.000us 0.000us 3
3935
- aten::reshape 0.35% 15.860us 0.82% 37.181us 6.197us 0.000us 0.00% 0.000us 0.000us 6
3936
- aten::view 0.47% 21.321us 0.47% 21.321us 3.553us 0.000us 0.00% 0.000us 0.000us 6
3937
- cudaDeviceSynchronize 50.29% 2.288ms 50.29% 2.288ms 2.288ms 0.000us 0.00% 0.000us 0.000us 1
3938
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3939
- Self CPU time total: 4.550ms
3940
- Self CUDA time total: 2.795ms
3941
 
3942
 
3943
 
@@ -3947,21 +3947,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
3947
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3948
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3949
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3950
- xformers_meff 6.95% 312.321us 44.96% 2.021ms 2.021ms 0.000us 0.00% 3.832ms 3.832ms 1
3951
- xformers_flash3::flash_fwd 3.14% 141.315us 37.51% 1.686ms 561.970us 0.000us 0.00% 3.832ms 1.277ms 3
3952
- flash_attn_3::fwd 1.18% 53.030us 34.37% 1.545ms 514.865us 2.890ms 100.00% 3.832ms 1.277ms 3
3953
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.892ms 100.05% 2.892ms 2.892ms 1
3954
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.890ms 100.00% 2.890ms 963.329us 3
3955
- Activity Buffer Request 31.64% 1.422ms 31.64% 1.422ms 1.422ms 942.465us 32.61% 942.465us 942.465us 1
3956
- aten::empty 0.68% 30.660us 0.68% 30.660us 5.110us 0.000us 0.00% 0.000us 0.000us 6
3957
- cudaFuncSetAttribute 0.12% 5.592us 0.12% 5.592us 1.864us 0.000us 0.00% 0.000us 0.000us 3
3958
- cudaLaunchKernel 0.74% 33.432us 0.74% 33.432us 11.144us 0.000us 0.00% 0.000us 0.000us 3
3959
- aten::reshape 0.20% 8.951us 0.50% 22.691us 3.782us 0.000us 0.00% 0.000us 0.000us 6
3960
- aten::view 0.31% 13.740us 0.31% 13.740us 2.290us 0.000us 0.00% 0.000us 0.000us 6
3961
- cudaDeviceSynchronize 55.04% 2.474ms 55.04% 2.474ms 2.474ms 0.000us 0.00% 0.000us 0.000us 1
3962
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3963
- Self CPU time total: 4.495ms
3964
- Self CUDA time total: 2.890ms
3965
 
3966
 
3967
 
@@ -3971,21 +3971,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
3971
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3972
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3973
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3974
- xformers_meff 6.65% 298.008us 44.73% 2.006ms 2.006ms 0.000us 0.00% 3.867ms 3.867ms 1
3975
- xformers_flash3::flash_fwd 3.15% 141.235us 37.58% 1.685ms 561.690us 0.000us 0.00% 3.867ms 1.289ms 3
3976
- flash_attn_3::fwd 1.18% 53.120us 34.43% 1.544ms 514.611us 2.888ms 100.00% 3.867ms 1.289ms 3
3977
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.890ms 100.06% 2.890ms 2.890ms 1
3978
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.888ms 100.00% 2.888ms 962.683us 3
3979
- Activity Buffer Request 31.72% 1.422ms 31.72% 1.422ms 1.422ms 978.939us 33.90% 978.939us 978.939us 1
3980
- aten::empty 0.67% 30.192us 0.67% 30.192us 5.032us 0.000us 0.00% 0.000us 0.000us 6
3981
- cudaFuncSetAttribute 0.12% 5.491us 0.12% 5.491us 1.830us 0.000us 0.00% 0.000us 0.000us 3
3982
- cudaLaunchKernel 0.73% 32.901us 0.73% 32.901us 10.967us 0.000us 0.00% 0.000us 0.000us 3
3983
- aten::reshape 0.20% 8.773us 0.50% 22.603us 3.767us 0.000us 0.00% 0.000us 0.000us 6
3984
- aten::view 0.31% 13.830us 0.31% 13.830us 2.305us 0.000us 0.00% 0.000us 0.000us 6
3985
- cudaDeviceSynchronize 55.27% 2.478ms 55.27% 2.478ms 2.478ms 0.000us 0.00% 0.000us 0.000us 1
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
- Self CPU time total: 4.484ms
3988
- Self CUDA time total: 2.888ms
3989
 
3990
 
3991
 
@@ -3995,21 +3995,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3997
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3998
- xformers_meff 6.31% 299.042us 46.56% 2.205ms 2.205ms 0.000us 0.00% 3.936ms 3.936ms 1
3999
- xformers_flash3::flash_fwd 2.97% 140.784us 39.75% 1.883ms 627.609us 0.000us 0.00% 3.936ms 1.312ms 3
4000
- flash_attn_3::fwd 1.10% 52.191us 36.78% 1.742ms 580.681us 2.941ms 100.00% 3.936ms 1.312ms 3
4001
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.943ms 100.05% 2.943ms 2.943ms 1
4002
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.941ms 100.00% 2.941ms 980.445us 3
4003
- Activity Buffer Request 30.11% 1.426ms 30.11% 1.426ms 1.426ms 994.973us 33.83% 994.973us 994.973us 1
4004
- aten::empty 0.64% 30.333us 0.64% 30.333us 5.055us 0.000us 0.00% 0.000us 0.000us 6
4005
- cudaFuncSetAttribute 0.11% 5.440us 0.11% 5.440us 1.813us 0.000us 0.00% 0.000us 0.000us 3
4006
- cudaLaunchKernel 4.81% 227.898us 4.81% 227.898us 75.966us 0.000us 0.00% 0.000us 0.000us 3
4007
- aten::reshape 0.19% 8.769us 0.49% 23.220us 3.870us 0.000us 0.00% 0.000us 0.000us 6
4008
- aten::view 0.31% 14.451us 0.31% 14.451us 2.409us 0.000us 0.00% 0.000us 0.000us 6
4009
- cudaDeviceSynchronize 53.44% 2.531ms 53.44% 2.531ms 2.531ms 0.000us 0.00% 0.000us 0.000us 1
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
- Self CPU time total: 4.736ms
4012
- Self CUDA time total: 2.941ms
4013
 
4014
 
4015
 
@@ -4019,21 +4019,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
- xformers_meff 5.82% 299.962us 41.73% 2.152ms 2.152ms 0.000us 0.00% 4.566ms 4.566ms 1
4023
- xformers_flash3::flash_fwd 2.76% 142.114us 35.47% 1.829ms 609.751us 0.000us 0.00% 4.566ms 1.522ms 3
4024
- flash_attn_3::fwd 1.04% 53.631us 32.71% 1.687ms 562.380us 3.419ms 100.00% 4.566ms 1.522ms 3
4025
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.420ms 100.05% 3.420ms 3.420ms 1
4026
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.419ms 100.00% 3.419ms 1.140ms 3
4027
- Activity Buffer Request 27.56% 1.422ms 27.56% 1.422ms 1.422ms 1.148ms 33.58% 1.148ms 1.148ms 1
4028
- aten::empty 0.60% 31.172us 0.60% 31.172us 5.195us 0.000us 0.00% 0.000us 0.000us 6
4029
- cudaFuncSetAttribute 0.11% 5.431us 0.11% 5.431us 1.810us 0.000us 0.00% 0.000us 0.000us 3
4030
- cudaLaunchKernel 3.40% 175.366us 3.40% 175.366us 58.455us 0.000us 0.00% 0.000us 0.000us 3
4031
- aten::reshape 0.17% 8.849us 0.45% 23.030us 3.838us 0.000us 0.00% 0.000us 0.000us 6
4032
- aten::view 0.27% 14.181us 0.27% 14.181us 2.363us 0.000us 0.00% 0.000us 0.000us 6
4033
- cudaDeviceSynchronize 58.27% 3.005ms 58.27% 3.005ms 3.005ms 0.000us 0.00% 0.000us 0.000us 1
4034
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4035
- Self CPU time total: 5.157ms
4036
- Self CUDA time total: 3.419ms
4037
 
4038
 
4039
 
@@ -4043,37 +4043,37 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
4043
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4044
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4045
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4046
- xformers_meff 5.76% 295.800us 41.67% 2.139ms 2.139ms 0.000us 0.00% 4.557ms 4.557ms 1
4047
- xformers_flash3::flash_fwd 2.75% 141.044us 35.47% 1.821ms 606.924us 0.000us 0.00% 4.557ms 1.519ms 3
4048
- flash_attn_3::fwd 1.04% 53.523us 32.72% 1.680ms 559.910us 3.405ms 100.00% 4.557ms 1.519ms 3
4049
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.407ms 100.05% 3.407ms 3.407ms 1
4050
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.405ms 100.00% 3.405ms 1.135ms 3
4051
- Activity Buffer Request 27.67% 1.420ms 27.67% 1.420ms 1.420ms 1.152ms 33.82% 1.152ms 1.152ms 1
4052
- aten::empty 0.60% 30.610us 0.60% 30.610us 5.102us 0.000us 0.00% 0.000us 0.000us 6
4053
- cudaFuncSetAttribute 0.12% 6.310us 0.12% 6.310us 2.103us 0.000us 0.00% 0.000us 0.000us 3
4054
- cudaLaunchKernel 3.29% 168.946us 3.29% 168.946us 56.315us 0.000us 0.00% 0.000us 0.000us 3
4055
- aten::reshape 0.17% 8.721us 0.44% 22.392us 3.732us 0.000us 0.00% 0.000us 0.000us 6
4056
- aten::view 0.27% 13.671us 0.27% 13.671us 2.279us 0.000us 0.00% 0.000us 0.000us 6
4057
- cudaDeviceSynchronize 58.33% 2.994ms 58.33% 2.994ms 2.994ms 0.000us 0.00% 0.000us 0.000us 1
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
- Self CPU time total: 5.133ms
4060
- Self CUDA time total: 3.405ms
4061
 
4062
 
4063
  impl wl p50(ms) ok
4064
- xformers_meff cuda_attn_L128_bfloat16 0.98 True
4065
  xformers_meff cuda_attn_L256_bfloat16 1.03 True
4066
  xformers_meff cuda_attn_L320_bfloat16 1.08 True
4067
- xformers_meff cuda_attn_L384_bfloat16 1.10 True
4068
- xformers_meff cuda_attn_L448_bfloat16 1.23 True
4069
- xformers_meff cuda_attn_L512_bfloat16 1.22 True
4070
  </pre></div>
4071
  <div class="uv-install-logs" id="uv-logs-benchmark">
4072
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4073
  <div class="uv-logs-content" style="display: none;">
4074
  Downloading xformers (111.8MiB)
4075
  Downloading xformers
4076
- Installed 1 package in 14ms
4077
  </div>
4078
  </div>
4079
  <div class="cell-artifacts">
 
3923
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3924
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3925
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3926
+ xformers_meff 10.85% 481.112us 51.55% 2.285ms 2.285ms 0.000us 0.00% 3.582ms 3.582ms 1
3927
+ xformers_flash3::flash_fwd 4.56% 202.185us 39.85% 1.766ms 588.715us 0.000us 0.00% 3.582ms 1.194ms 3
3928
+ flash_attn_3::fwd 1.68% 74.662us 35.29% 1.564ms 521.320us 2.681ms 100.00% 3.582ms 1.194ms 3
3929
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.682ms 100.06% 2.682ms 2.682ms 1
3930
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.681ms 100.00% 2.681ms 893.515us 3
3931
+ Activity Buffer Request 31.74% 1.407ms 31.74% 1.407ms 1.407ms 901.761us 33.64% 901.761us 901.761us 1
3932
+ aten::empty 0.77% 33.920us 0.77% 33.920us 5.653us 0.000us 0.00% 0.000us 0.000us 6
3933
+ cudaFuncSetAttribute 0.23% 10.152us 0.23% 10.152us 3.384us 0.000us 0.00% 0.000us 0.000us 3
3934
+ cudaLaunchKernel 0.87% 38.521us 0.87% 38.521us 12.840us 0.000us 0.00% 0.000us 0.000us 3
3935
+ aten::reshape 0.29% 13.028us 0.85% 37.710us 6.285us 0.000us 0.00% 0.000us 0.000us 6
3936
+ aten::view 0.56% 24.682us 0.56% 24.682us 4.114us 0.000us 0.00% 0.000us 0.000us 6
3937
+ cudaDeviceSynchronize 48.45% 2.147ms 48.45% 2.147ms 2.147ms 0.000us 0.00% 0.000us 0.000us 1
3938
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3939
+ Self CPU time total: 4.432ms
3940
+ Self CUDA time total: 2.681ms
3941
 
3942
 
3943
 
 
3947
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3948
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3949
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3950
+ xformers_meff 7.16% 317.438us 45.96% 2.036ms 2.036ms 0.000us 0.00% 3.779ms 3.779ms 1
3951
+ xformers_flash3::flash_fwd 3.35% 148.243us 38.25% 1.695ms 564.991us 0.000us 0.00% 3.779ms 1.260ms 3
3952
+ flash_attn_3::fwd 1.25% 55.403us 34.91% 1.547ms 515.576us 2.825ms 100.00% 3.779ms 1.260ms 3
3953
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.827ms 100.05% 2.827ms 2.827ms 1
3954
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.825ms 100.00% 2.825ms 941.739us 3
3955
+ Activity Buffer Request 32.14% 1.424ms 32.14% 1.424ms 1.424ms 954.080us 33.77% 954.080us 954.080us 1
3956
+ aten::empty 0.63% 27.720us 0.63% 27.720us 4.620us 0.000us 0.00% 0.000us 0.000us 6
3957
+ cudaFuncSetAttribute 0.12% 5.400us 0.12% 5.400us 1.800us 0.000us 0.00% 0.000us 0.000us 3
3958
+ cudaLaunchKernel 0.77% 34.161us 0.77% 34.161us 11.387us 0.000us 0.00% 0.000us 0.000us 3
3959
+ aten::reshape 0.21% 9.370us 0.54% 23.750us 3.958us 0.000us 0.00% 0.000us 0.000us 6
3960
+ aten::view 0.32% 14.380us 0.32% 14.380us 2.397us 0.000us 0.00% 0.000us 0.000us 6
3961
+ cudaDeviceSynchronize 54.04% 2.395ms 54.04% 2.395ms 2.395ms 0.000us 0.00% 0.000us 0.000us 1
3962
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3963
+ Self CPU time total: 4.431ms
3964
+ Self CUDA time total: 2.825ms
3965
 
3966
 
3967
 
 
3971
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3972
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3973
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3974
+ xformers_meff 6.87% 310.027us 44.72% 2.018ms 2.018ms 0.000us 0.00% 3.923ms 3.923ms 1
3975
+ xformers_flash3::flash_fwd 3.22% 145.444us 37.33% 1.684ms 561.324us 0.000us 0.00% 3.923ms 1.308ms 3
3976
+ flash_attn_3::fwd 1.15% 52.002us 34.10% 1.539ms 512.843us 2.919ms 100.00% 3.923ms 1.308ms 3
3977
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.921ms 100.06% 2.921ms 2.921ms 1
3978
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.919ms 100.00% 2.919ms 973.037us 3
3979
+ Activity Buffer Request 31.44% 1.418ms 31.44% 1.418ms 1.418ms 1.004ms 34.40% 1.004ms 1.004ms 1
3980
+ aten::empty 0.63% 28.392us 0.63% 28.392us 4.732us 0.000us 0.00% 0.000us 0.000us 6
3981
+ cudaFuncSetAttribute 0.12% 5.520us 0.12% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3
3982
+ cudaLaunchKernel 0.76% 34.420us 0.76% 34.420us 11.473us 0.000us 0.00% 0.000us 0.000us 3
3983
+ aten::reshape 0.21% 9.519us 0.52% 23.650us 3.942us 0.000us 0.00% 0.000us 0.000us 6
3984
+ aten::view 0.31% 14.131us 0.31% 14.131us 2.355us 0.000us 0.00% 0.000us 0.000us 6
3985
+ cudaDeviceSynchronize 55.28% 2.494ms 55.28% 2.494ms 2.494ms 0.000us 0.00% 0.000us 0.000us 1
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
+ Self CPU time total: 4.511ms
3988
+ Self CUDA time total: 2.919ms
3989
 
3990
 
3991
 
 
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3997
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3998
+ xformers_meff 6.73% 317.798us 47.46% 2.241ms 2.241ms 0.000us 0.00% 3.892ms 3.892ms 1
3999
+ xformers_flash3::flash_fwd 3.10% 146.544us 40.23% 1.900ms 633.169us 0.000us 0.00% 3.892ms 1.297ms 3
4000
+ flash_attn_3::fwd 1.15% 54.462us 37.13% 1.753ms 584.321us 2.910ms 100.00% 3.892ms 1.297ms 3
4001
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.911ms 100.05% 2.911ms 2.911ms 1
4002
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.910ms 100.00% 2.910ms 969.848us 3
4003
+ Activity Buffer Request 30.01% 1.417ms 30.01% 1.417ms 1.417ms 982.915us 33.78% 982.915us 982.915us 1
4004
+ aten::empty 0.62% 29.170us 0.62% 29.170us 4.862us 0.000us 0.00% 0.000us 0.000us 6
4005
+ cudaFuncSetAttribute 0.11% 5.370us 0.11% 5.370us 1.790us 0.000us 0.00% 0.000us 0.000us 3
4006
+ cudaLaunchKernel 5.23% 247.156us 5.23% 247.156us 82.385us 0.000us 0.00% 0.000us 0.000us 3
4007
+ aten::reshape 0.20% 9.560us 0.50% 23.460us 3.910us 0.000us 0.00% 0.000us 0.000us 6
4008
+ aten::view 0.29% 13.900us 0.29% 13.900us 2.317us 0.000us 0.00% 0.000us 0.000us 6
4009
+ cudaDeviceSynchronize 52.54% 2.481ms 52.54% 2.481ms 2.481ms 0.000us 0.00% 0.000us 0.000us 1
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
+ Self CPU time total: 4.721ms
4012
+ Self CUDA time total: 2.910ms
4013
 
4014
 
4015
 
 
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
+ xformers_meff 5.86% 306.369us 41.94% 2.193ms 2.193ms 0.000us 0.00% 4.614ms 4.614ms 1
4023
+ xformers_flash3::flash_fwd 2.85% 149.202us 35.63% 1.863ms 620.885us 0.000us 0.00% 4.614ms 1.538ms 3
4024
+ flash_attn_3::fwd 1.03% 53.951us 32.77% 1.713ms 571.151us 3.461ms 100.00% 4.614ms 1.538ms 3
4025
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.462ms 100.04% 3.462ms 3.462ms 1
4026
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.461ms 100.00% 3.461ms 1.154ms 3
4027
+ Activity Buffer Request 27.28% 1.426ms 27.28% 1.426ms 1.426ms 1.153ms 33.31% 1.153ms 1.153ms 1
4028
+ aten::empty 0.55% 28.813us 0.55% 28.813us 4.802us 0.000us 0.00% 0.000us 0.000us 6
4029
+ cudaFuncSetAttribute 0.11% 5.560us 0.11% 5.560us 1.853us 0.000us 0.00% 0.000us 0.000us 3
4030
+ cudaLaunchKernel 3.80% 198.684us 3.80% 198.684us 66.228us 0.000us 0.00% 0.000us 0.000us 3
4031
+ aten::reshape 0.18% 9.430us 0.46% 23.930us 3.988us 0.000us 0.00% 0.000us 0.000us 6
4032
+ aten::view 0.28% 14.500us 0.28% 14.500us 2.417us 0.000us 0.00% 0.000us 0.000us 6
4033
+ cudaDeviceSynchronize 58.06% 3.036ms 58.06% 3.036ms 3.036ms 0.000us 0.00% 0.000us 0.000us 1
4034
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4035
+ Self CPU time total: 5.228ms
4036
+ Self CUDA time total: 3.461ms
4037
 
4038
 
4039
 
 
4043
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4044
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4045
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4046
+ xformers_meff 5.96% 310.158us 41.66% 2.167ms 2.167ms 0.000us 0.00% 4.643ms 4.643ms 1
4047
+ xformers_flash3::flash_fwd 2.83% 146.954us 35.22% 1.832ms 610.728us 0.000us 0.00% 4.643ms 1.548ms 3
4048
+ flash_attn_3::fwd 1.00% 51.911us 32.40% 1.685ms 561.744us 3.464ms 100.00% 4.643ms 1.548ms 3
4049
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.465ms 100.04% 3.465ms 3.465ms 1
4050
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.464ms 100.00% 3.464ms 1.155ms 3
4051
+ Activity Buffer Request 27.49% 1.430ms 27.49% 1.430ms 1.430ms 1.179ms 34.05% 1.179ms 1.179ms 1
4052
+ aten::empty 0.54% 28.311us 0.54% 28.311us 4.719us 0.000us 0.00% 0.000us 0.000us 6
4053
+ cudaFuncSetAttribute 0.11% 5.750us 0.11% 5.750us 1.917us 0.000us 0.00% 0.000us 0.000us 3
4054
+ cudaLaunchKernel 3.25% 169.084us 3.25% 169.084us 56.361us 0.000us 0.00% 0.000us 0.000us 3
4055
+ aten::reshape 0.17% 8.670us 0.48% 24.720us 4.120us 0.000us 0.00% 0.000us 0.000us 6
4056
+ aten::view 0.31% 16.050us 0.31% 16.050us 2.675us 0.000us 0.00% 0.000us 0.000us 6
4057
+ cudaDeviceSynchronize 58.34% 3.035ms 58.34% 3.035ms 3.035ms 0.000us 0.00% 0.000us 0.000us 1
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
+ Self CPU time total: 5.202ms
4060
+ Self CUDA time total: 3.464ms
4061
 
4062
 
4063
  impl wl p50(ms) ok
4064
+ xformers_meff cuda_attn_L128_bfloat16 1.00 True
4065
  xformers_meff cuda_attn_L256_bfloat16 1.03 True
4066
  xformers_meff cuda_attn_L320_bfloat16 1.08 True
4067
+ xformers_meff cuda_attn_L384_bfloat16 1.09 True
4068
+ xformers_meff cuda_attn_L448_bfloat16 1.25 True
4069
+ xformers_meff cuda_attn_L512_bfloat16 1.24 True
4070
  </pre></div>
4071
  <div class="uv-install-logs" id="uv-logs-benchmark">
4072
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4073
  <div class="uv-logs-content" style="display: none;">
4074
  Downloading xformers (111.8MiB)
4075
  Downloading xformers
4076
+ Installed 1 package in 13ms
4077
  </div>
4078
  </div>
4079
  <div class="cell-artifacts">
flash_attn/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 168c229932ad06a68508a4a77b66485ff9bcf48ed736a5ffdd003f5cb9e8e639
  • Pointer size: 130 Bytes
  • Size of remote file: 24.8 kB

Git LFS Details

  • SHA256: 0a7d7b3dc8fc6b60a4b9f8bfcf3e229706548b71a8174822b89cc9a2746d3bbd
  • Pointer size: 130 Bytes
  • Size of remote file: 24.8 kB
flash_attn/results/combined_results.html CHANGED
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
- <dc:date>2025-10-28T14:09:17.505622</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
@@ -3982,96 +3982,96 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3982
  <g id="matplotlib.axis_2">
3983
  <g id="ytick_1">
3984
  <g id="grid-y--2" class="grid grid-y">
3985
- <path d="M 47.81 403.521712 L 835.361742 403.521712 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3986
  </g>
3987
  <g id="line2d_7">
3988
  <defs>
3989
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3990
  </defs>
3991
  <g>
3992
- <use ns4:href="#m0fca2865ba" x="47.81" y="403.521712" style="stroke: #000000; stroke-width: 0.8" />
3993
  </g>
3994
  </g>
3995
  <g id="text_7">
3996
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="407.320931" transform="rotate(-0 40.81 407.320931)">1.0</text>
3997
  </g>
3998
  </g>
3999
  <g id="ytick_2">
4000
  <g id="grid-y--3" class="grid grid-y">
4001
- <path d="M 47.81 343.523424 L 835.361742 343.523424 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4002
  </g>
4003
  <g id="line2d_8">
4004
  <g>
4005
- <use ns4:href="#m0fca2865ba" x="47.81" y="343.523424" style="stroke: #000000; stroke-width: 0.8" />
4006
  </g>
4007
  </g>
4008
  <g id="text_8">
4009
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="347.322643" transform="rotate(-0 40.81 347.322643)">1.2</text>
4010
  </g>
4011
  </g>
4012
  <g id="ytick_3">
4013
  <g id="grid-y--4" class="grid grid-y">
4014
- <path d="M 47.81 283.525136 L 835.361742 283.525136 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4015
  </g>
4016
  <g id="line2d_9">
4017
  <g>
4018
- <use ns4:href="#m0fca2865ba" x="47.81" y="283.525136" style="stroke: #000000; stroke-width: 0.8" />
4019
  </g>
4020
  </g>
4021
  <g id="text_9">
4022
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="287.324355" transform="rotate(-0 40.81 287.324355)">1.4</text>
4023
  </g>
4024
  </g>
4025
  <g id="ytick_4">
4026
  <g id="grid-y--5" class="grid grid-y">
4027
- <path d="M 47.81 223.526848 L 835.361742 223.526848 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4028
  </g>
4029
  <g id="line2d_10">
4030
  <g>
4031
- <use ns4:href="#m0fca2865ba" x="47.81" y="223.526848" style="stroke: #000000; stroke-width: 0.8" />
4032
  </g>
4033
  </g>
4034
  <g id="text_10">
4035
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="227.326067" transform="rotate(-0 40.81 227.326067)">1.6</text>
4036
  </g>
4037
  </g>
4038
  <g id="ytick_5">
4039
  <g id="grid-y--6" class="grid grid-y">
4040
- <path d="M 47.81 163.52856 L 835.361742 163.52856 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4041
  </g>
4042
  <g id="line2d_11">
4043
  <g>
4044
- <use ns4:href="#m0fca2865ba" x="47.81" y="163.52856" style="stroke: #000000; stroke-width: 0.8" />
4045
  </g>
4046
  </g>
4047
  <g id="text_11">
4048
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="167.327779" transform="rotate(-0 40.81 167.327779)">1.8</text>
4049
  </g>
4050
  </g>
4051
  <g id="ytick_6">
4052
  <g id="grid-y--7" class="grid grid-y">
4053
- <path d="M 47.81 103.530273 L 835.361742 103.530273 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4054
  </g>
4055
  <g id="line2d_12">
4056
  <g>
4057
- <use ns4:href="#m0fca2865ba" x="47.81" y="103.530273" style="stroke: #000000; stroke-width: 0.8" />
4058
  </g>
4059
  </g>
4060
  <g id="text_12">
4061
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="107.329491" transform="rotate(-0 40.81 107.329491)">2.0</text>
4062
  </g>
4063
  </g>
4064
  <g id="ytick_7">
4065
  <g id="grid-y--8" class="grid grid-y">
4066
- <path d="M 47.81 43.531985 L 835.361742 43.531985 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4067
  </g>
4068
  <g id="line2d_13">
4069
  <g>
4070
- <use ns4:href="#m0fca2865ba" x="47.81" y="43.531985" style="stroke: #000000; stroke-width: 0.8" />
4071
  </g>
4072
  </g>
4073
  <g id="text_13">
4074
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="47.331204" transform="rotate(-0 40.81 47.331204)">2.2</text>
4075
  </g>
4076
  </g>
4077
  <g id="label--y" class="ylabel">
@@ -4079,73 +4079,73 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4079
  </g>
4080
  </g>
4081
  <g id="series--torch-flash-ma" class="series">
4082
- <path d="M 83.607806 337.456697 L 226.799032 322.330829 L 369.990258 318.592935 L 513.181484 311.965825 L 656.37271 262.663131 L 799.563935 254.692359 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4083
  <defs>
4084
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4085
  </defs>
4086
  <g clip-path="url(#p09feef2583)">
4087
- <use ns4:href="#md7efaf3aec" x="83.607806" y="337.456697" style="fill: #1f77b4; stroke: #1f77b4" />
4088
- <use ns4:href="#md7efaf3aec" x="226.799032" y="322.330829" style="fill: #1f77b4; stroke: #1f77b4" />
4089
- <use ns4:href="#md7efaf3aec" x="369.990258" y="318.592935" style="fill: #1f77b4; stroke: #1f77b4" />
4090
- <use ns4:href="#md7efaf3aec" x="513.181484" y="311.965825" style="fill: #1f77b4; stroke: #1f77b4" />
4091
- <use ns4:href="#md7efaf3aec" x="656.37271" y="262.663131" style="fill: #1f77b4; stroke: #1f77b4" />
4092
- <use ns4:href="#md7efaf3aec" x="799.563935" y="254.692359" style="fill: #1f77b4; stroke: #1f77b4" />
4093
  </g>
4094
  </g>
4095
  <g id="series--torch-mem-eff" class="series">
4096
- <path d="M 83.607806 144.033917 L 226.799032 111.747638 L 369.990258 92.42159 L 513.181484 85.353791 L 656.37271 94.728524 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4097
  <defs>
4098
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4099
  </defs>
4100
  <g clip-path="url(#p09feef2583)">
4101
- <use ns4:href="#m9b8c54d372" x="83.607806" y="144.033917" style="fill: #ff7f0e; stroke: #ff7f0e" />
4102
- <use ns4:href="#m9b8c54d372" x="226.799032" y="111.747638" style="fill: #ff7f0e; stroke: #ff7f0e" />
4103
- <use ns4:href="#m9b8c54d372" x="369.990258" y="92.42159" style="fill: #ff7f0e; stroke: #ff7f0e" />
4104
- <use ns4:href="#m9b8c54d372" x="513.181484" y="85.353791" style="fill: #ff7f0e; stroke: #ff7f0e" />
4105
- <use ns4:href="#m9b8c54d372" x="656.37271" y="94.728524" style="fill: #ff7f0e; stroke: #ff7f0e" />
4106
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4107
  </g>
4108
  </g>
4109
  <g id="series--xformers-meff" class="series">
4110
- <path d="M 83.607806 408.245077 L 226.799032 395.990127 L 369.990258 378.455027 L 513.181484 373.43287 L 656.37271 333.571508 L 799.563935 337.423698 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4111
  <defs>
4112
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4113
  </defs>
4114
  <g clip-path="url(#p09feef2583)">
4115
- <use ns4:href="#mc655281e0b" x="83.607806" y="408.245077" style="fill: #2ca02c; stroke: #2ca02c" />
4116
- <use ns4:href="#mc655281e0b" x="226.799032" y="395.990127" style="fill: #2ca02c; stroke: #2ca02c" />
4117
- <use ns4:href="#mc655281e0b" x="369.990258" y="378.455027" style="fill: #2ca02c; stroke: #2ca02c" />
4118
- <use ns4:href="#mc655281e0b" x="513.181484" y="373.43287" style="fill: #2ca02c; stroke: #2ca02c" />
4119
- <use ns4:href="#mc655281e0b" x="656.37271" y="333.571508" style="fill: #2ca02c; stroke: #2ca02c" />
4120
- <use ns4:href="#mc655281e0b" x="799.563935" y="337.423698" style="fill: #2ca02c; stroke: #2ca02c" />
4121
  </g>
4122
  </g>
4123
  <g id="series--hf-kernels-flash-attn" class="series">
4124
- <path d="M 83.607806 415.568468 L 226.799032 400.735991 L 369.990258 386.008812 L 513.181484 387.284075 L 656.37271 338.461368 L 799.563935 341.493982 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4125
  <defs>
4126
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4127
  </defs>
4128
  <g clip-path="url(#p09feef2583)">
4129
- <use ns4:href="#m61c8040d7e" x="83.607806" y="415.568468" style="fill: #d62728; stroke: #d62728" />
4130
- <use ns4:href="#m61c8040d7e" x="226.799032" y="400.735991" style="fill: #d62728; stroke: #d62728" />
4131
- <use ns4:href="#m61c8040d7e" x="369.990258" y="386.008812" style="fill: #d62728; stroke: #d62728" />
4132
- <use ns4:href="#m61c8040d7e" x="513.181484" y="387.284075" style="fill: #d62728; stroke: #d62728" />
4133
- <use ns4:href="#m61c8040d7e" x="656.37271" y="338.461368" style="fill: #d62728; stroke: #d62728" />
4134
- <use ns4:href="#m61c8040d7e" x="799.563935" y="341.493982" style="fill: #d62728; stroke: #d62728" />
4135
  </g>
4136
  </g>
4137
  <g id="series--hf-kernels-flash-attn3" class="series">
4138
- <path d="M 83.607806 428.387702 L 226.799032 415.50217 L 369.990258 397.727077 L 513.181484 397.526383 L 656.37271 348.148992 L 799.563935 348.55398 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4139
  <defs>
4140
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4141
  </defs>
4142
  <g clip-path="url(#p09feef2583)">
4143
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4144
- <use ns4:href="#m7cd35be9cc" x="226.799032" y="415.50217" style="fill: #9467bd; stroke: #9467bd" />
4145
- <use ns4:href="#m7cd35be9cc" x="369.990258" y="397.727077" style="fill: #9467bd; stroke: #9467bd" />
4146
- <use ns4:href="#m7cd35be9cc" x="513.181484" y="397.526383" style="fill: #9467bd; stroke: #9467bd" />
4147
- <use ns4:href="#m7cd35be9cc" x="656.37271" y="348.148992" style="fill: #9467bd; stroke: #9467bd" />
4148
- <use ns4:href="#m7cd35be9cc" x="799.563935" y="348.55398" style="fill: #9467bd; stroke: #9467bd" />
4149
  </g>
4150
  </g>
4151
  <g id="patch_3">
@@ -4337,48 +4337,48 @@ Summary: 6 found, 0 skipped, 0 missing
4337
  COMBINED BENCHMARK SUMMARY
4338
 
4339
  impl wl p50(ms) ok
4340
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
4341
  hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
4342
  hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
4343
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.05 True
4344
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
4345
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.21 True
4346
- hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
4347
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True
4348
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.02 True
4349
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
4350
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
4351
  hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4352
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4353
- Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
4354
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4355
- Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
4356
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4357
- Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
4358
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4359
- Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
4360
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4361
- Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
4362
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4363
- Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
4364
  torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
4365
- torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
4366
- torch_flash_ma cuda_attn_L320_bfloat16 1.28 True
4367
- torch_flash_ma cuda_attn_L384_bfloat16 1.31 True
4368
  torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
4369
  torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
4370
- torch_mem_eff cuda_attn_L128_bfloat16 1.86 True
4371
- torch_mem_eff cuda_attn_L256_bfloat16 1.97 True
4372
- torch_mem_eff cuda_attn_L320_bfloat16 2.04 True
4373
- torch_mem_eff cuda_attn_L384_bfloat16 2.06 True
4374
- torch_mem_eff cuda_attn_L448_bfloat16 2.03 True
4375
  torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
4376
- xformers_meff cuda_attn_L128_bfloat16 0.98 True
4377
  xformers_meff cuda_attn_L256_bfloat16 1.03 True
4378
  xformers_meff cuda_attn_L320_bfloat16 1.08 True
4379
- xformers_meff cuda_attn_L384_bfloat16 1.10 True
4380
- xformers_meff cuda_attn_L448_bfloat16 1.23 True
4381
- xformers_meff cuda_attn_L512_bfloat16 1.22 True
4382
 
4383
  GENERATING COMBINED VISUALIZATION
4384
 
@@ -4402,7 +4402,7 @@ Implementations included:
4402
  <div class="uv-install-logs" id="uv-logs-combine">
4403
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4404
  <div class="uv-logs-content" style="display: none;">
4405
- Installed 37 packages in 187ms
4406
  </div>
4407
  </div>
4408
  <div class="cell-artifacts">
@@ -4415,7 +4415,7 @@ Installed 37 packages in 187ms
4415
  <rdf:RDF>
4416
  <ns2:Work>
4417
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4418
- <dc:date>2025-10-28T14:09:17.505622</dc:date>
4419
  <dc:format>image/svg+xml</dc:format>
4420
  <dc:creator>
4421
  <ns2:Agent>
@@ -4525,96 +4525,96 @@ Installed 37 packages in 187ms
4525
  <g id="matplotlib.axis_2">
4526
  <g id="ytick_1">
4527
  <g id="grid-y--2" class="grid grid-y">
4528
- <path d="M 47.81 403.521712 L 835.361742 403.521712 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4529
  </g>
4530
  <g id="line2d_7">
4531
  <defs>
4532
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4533
  </defs>
4534
  <g>
4535
- <use ns4:href="#m0fca2865ba" x="47.81" y="403.521712" style="stroke: #000000; stroke-width: 0.8" />
4536
  </g>
4537
  </g>
4538
  <g id="text_7">
4539
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="407.320931" transform="rotate(-0 40.81 407.320931)">1.0</text>
4540
  </g>
4541
  </g>
4542
  <g id="ytick_2">
4543
  <g id="grid-y--3" class="grid grid-y">
4544
- <path d="M 47.81 343.523424 L 835.361742 343.523424 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4545
  </g>
4546
  <g id="line2d_8">
4547
  <g>
4548
- <use ns4:href="#m0fca2865ba" x="47.81" y="343.523424" style="stroke: #000000; stroke-width: 0.8" />
4549
  </g>
4550
  </g>
4551
  <g id="text_8">
4552
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="347.322643" transform="rotate(-0 40.81 347.322643)">1.2</text>
4553
  </g>
4554
  </g>
4555
  <g id="ytick_3">
4556
  <g id="grid-y--4" class="grid grid-y">
4557
- <path d="M 47.81 283.525136 L 835.361742 283.525136 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4558
  </g>
4559
  <g id="line2d_9">
4560
  <g>
4561
- <use ns4:href="#m0fca2865ba" x="47.81" y="283.525136" style="stroke: #000000; stroke-width: 0.8" />
4562
  </g>
4563
  </g>
4564
  <g id="text_9">
4565
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="287.324355" transform="rotate(-0 40.81 287.324355)">1.4</text>
4566
  </g>
4567
  </g>
4568
  <g id="ytick_4">
4569
  <g id="grid-y--5" class="grid grid-y">
4570
- <path d="M 47.81 223.526848 L 835.361742 223.526848 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4571
  </g>
4572
  <g id="line2d_10">
4573
  <g>
4574
- <use ns4:href="#m0fca2865ba" x="47.81" y="223.526848" style="stroke: #000000; stroke-width: 0.8" />
4575
  </g>
4576
  </g>
4577
  <g id="text_10">
4578
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="227.326067" transform="rotate(-0 40.81 227.326067)">1.6</text>
4579
  </g>
4580
  </g>
4581
  <g id="ytick_5">
4582
  <g id="grid-y--6" class="grid grid-y">
4583
- <path d="M 47.81 163.52856 L 835.361742 163.52856 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4584
  </g>
4585
  <g id="line2d_11">
4586
  <g>
4587
- <use ns4:href="#m0fca2865ba" x="47.81" y="163.52856" style="stroke: #000000; stroke-width: 0.8" />
4588
  </g>
4589
  </g>
4590
  <g id="text_11">
4591
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="167.327779" transform="rotate(-0 40.81 167.327779)">1.8</text>
4592
  </g>
4593
  </g>
4594
  <g id="ytick_6">
4595
  <g id="grid-y--7" class="grid grid-y">
4596
- <path d="M 47.81 103.530273 L 835.361742 103.530273 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4597
  </g>
4598
  <g id="line2d_12">
4599
  <g>
4600
- <use ns4:href="#m0fca2865ba" x="47.81" y="103.530273" style="stroke: #000000; stroke-width: 0.8" />
4601
  </g>
4602
  </g>
4603
  <g id="text_12">
4604
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="107.329491" transform="rotate(-0 40.81 107.329491)">2.0</text>
4605
  </g>
4606
  </g>
4607
  <g id="ytick_7">
4608
  <g id="grid-y--8" class="grid grid-y">
4609
- <path d="M 47.81 43.531985 L 835.361742 43.531985 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4610
  </g>
4611
  <g id="line2d_13">
4612
  <g>
4613
- <use ns4:href="#m0fca2865ba" x="47.81" y="43.531985" style="stroke: #000000; stroke-width: 0.8" />
4614
  </g>
4615
  </g>
4616
  <g id="text_13">
4617
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="47.331204" transform="rotate(-0 40.81 47.331204)">2.2</text>
4618
  </g>
4619
  </g>
4620
  <g id="label--y" class="ylabel">
@@ -4622,73 +4622,73 @@ Installed 37 packages in 187ms
4622
  </g>
4623
  </g>
4624
  <g id="series--torch-flash-ma" class="series">
4625
- <path d="M 83.607806 337.456697 L 226.799032 322.330829 L 369.990258 318.592935 L 513.181484 311.965825 L 656.37271 262.663131 L 799.563935 254.692359 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4626
  <defs>
4627
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4628
  </defs>
4629
  <g clip-path="url(#p09feef2583)">
4630
- <use ns4:href="#md7efaf3aec" x="83.607806" y="337.456697" style="fill: #1f77b4; stroke: #1f77b4" />
4631
- <use ns4:href="#md7efaf3aec" x="226.799032" y="322.330829" style="fill: #1f77b4; stroke: #1f77b4" />
4632
- <use ns4:href="#md7efaf3aec" x="369.990258" y="318.592935" style="fill: #1f77b4; stroke: #1f77b4" />
4633
- <use ns4:href="#md7efaf3aec" x="513.181484" y="311.965825" style="fill: #1f77b4; stroke: #1f77b4" />
4634
- <use ns4:href="#md7efaf3aec" x="656.37271" y="262.663131" style="fill: #1f77b4; stroke: #1f77b4" />
4635
- <use ns4:href="#md7efaf3aec" x="799.563935" y="254.692359" style="fill: #1f77b4; stroke: #1f77b4" />
4636
  </g>
4637
  </g>
4638
  <g id="series--torch-mem-eff" class="series">
4639
- <path d="M 83.607806 144.033917 L 226.799032 111.747638 L 369.990258 92.42159 L 513.181484 85.353791 L 656.37271 94.728524 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4640
  <defs>
4641
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4642
  </defs>
4643
  <g clip-path="url(#p09feef2583)">
4644
- <use ns4:href="#m9b8c54d372" x="83.607806" y="144.033917" style="fill: #ff7f0e; stroke: #ff7f0e" />
4645
- <use ns4:href="#m9b8c54d372" x="226.799032" y="111.747638" style="fill: #ff7f0e; stroke: #ff7f0e" />
4646
- <use ns4:href="#m9b8c54d372" x="369.990258" y="92.42159" style="fill: #ff7f0e; stroke: #ff7f0e" />
4647
- <use ns4:href="#m9b8c54d372" x="513.181484" y="85.353791" style="fill: #ff7f0e; stroke: #ff7f0e" />
4648
- <use ns4:href="#m9b8c54d372" x="656.37271" y="94.728524" style="fill: #ff7f0e; stroke: #ff7f0e" />
4649
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4650
  </g>
4651
  </g>
4652
  <g id="series--xformers-meff" class="series">
4653
- <path d="M 83.607806 408.245077 L 226.799032 395.990127 L 369.990258 378.455027 L 513.181484 373.43287 L 656.37271 333.571508 L 799.563935 337.423698 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4654
  <defs>
4655
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4656
  </defs>
4657
  <g clip-path="url(#p09feef2583)">
4658
- <use ns4:href="#mc655281e0b" x="83.607806" y="408.245077" style="fill: #2ca02c; stroke: #2ca02c" />
4659
- <use ns4:href="#mc655281e0b" x="226.799032" y="395.990127" style="fill: #2ca02c; stroke: #2ca02c" />
4660
- <use ns4:href="#mc655281e0b" x="369.990258" y="378.455027" style="fill: #2ca02c; stroke: #2ca02c" />
4661
- <use ns4:href="#mc655281e0b" x="513.181484" y="373.43287" style="fill: #2ca02c; stroke: #2ca02c" />
4662
- <use ns4:href="#mc655281e0b" x="656.37271" y="333.571508" style="fill: #2ca02c; stroke: #2ca02c" />
4663
- <use ns4:href="#mc655281e0b" x="799.563935" y="337.423698" style="fill: #2ca02c; stroke: #2ca02c" />
4664
  </g>
4665
  </g>
4666
  <g id="series--hf-kernels-flash-attn" class="series">
4667
- <path d="M 83.607806 415.568468 L 226.799032 400.735991 L 369.990258 386.008812 L 513.181484 387.284075 L 656.37271 338.461368 L 799.563935 341.493982 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4668
  <defs>
4669
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4670
  </defs>
4671
  <g clip-path="url(#p09feef2583)">
4672
- <use ns4:href="#m61c8040d7e" x="83.607806" y="415.568468" style="fill: #d62728; stroke: #d62728" />
4673
- <use ns4:href="#m61c8040d7e" x="226.799032" y="400.735991" style="fill: #d62728; stroke: #d62728" />
4674
- <use ns4:href="#m61c8040d7e" x="369.990258" y="386.008812" style="fill: #d62728; stroke: #d62728" />
4675
- <use ns4:href="#m61c8040d7e" x="513.181484" y="387.284075" style="fill: #d62728; stroke: #d62728" />
4676
- <use ns4:href="#m61c8040d7e" x="656.37271" y="338.461368" style="fill: #d62728; stroke: #d62728" />
4677
- <use ns4:href="#m61c8040d7e" x="799.563935" y="341.493982" style="fill: #d62728; stroke: #d62728" />
4678
  </g>
4679
  </g>
4680
  <g id="series--hf-kernels-flash-attn3" class="series">
4681
- <path d="M 83.607806 428.387702 L 226.799032 415.50217 L 369.990258 397.727077 L 513.181484 397.526383 L 656.37271 348.148992 L 799.563935 348.55398 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4682
  <defs>
4683
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4684
  </defs>
4685
  <g clip-path="url(#p09feef2583)">
4686
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4687
- <use ns4:href="#m7cd35be9cc" x="226.799032" y="415.50217" style="fill: #9467bd; stroke: #9467bd" />
4688
- <use ns4:href="#m7cd35be9cc" x="369.990258" y="397.727077" style="fill: #9467bd; stroke: #9467bd" />
4689
- <use ns4:href="#m7cd35be9cc" x="513.181484" y="397.526383" style="fill: #9467bd; stroke: #9467bd" />
4690
- <use ns4:href="#m7cd35be9cc" x="656.37271" y="348.148992" style="fill: #9467bd; stroke: #9467bd" />
4691
- <use ns4:href="#m7cd35be9cc" x="799.563935" y="348.55398" style="fill: #9467bd; stroke: #9467bd" />
4692
  </g>
4693
  </g>
4694
  <g id="patch_3">
 
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
+ <dc:date>2025-10-29T14:28:03.109695</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
 
3982
  <g id="matplotlib.axis_2">
3983
  <g id="ytick_1">
3984
  <g id="grid-y--2" class="grid grid-y">
3985
+ <path d="M 47.81 409.00723 L 835.361742 409.00723 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3986
  </g>
3987
  <g id="line2d_7">
3988
  <defs>
3989
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3990
  </defs>
3991
  <g>
3992
+ <use ns4:href="#m0fca2865ba" x="47.81" y="409.00723" style="stroke: #000000; stroke-width: 0.8" />
3993
  </g>
3994
  </g>
3995
  <g id="text_7">
3996
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="412.806448" transform="rotate(-0 40.81 412.806448)">1.0</text>
3997
  </g>
3998
  </g>
3999
  <g id="ytick_2">
4000
  <g id="grid-y--3" class="grid grid-y">
4001
+ <path d="M 47.81 347.973099 L 835.361742 347.973099 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4002
  </g>
4003
  <g id="line2d_8">
4004
  <g>
4005
+ <use ns4:href="#m0fca2865ba" x="47.81" y="347.973099" style="stroke: #000000; stroke-width: 0.8" />
4006
  </g>
4007
  </g>
4008
  <g id="text_8">
4009
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="351.772318" transform="rotate(-0 40.81 351.772318)">1.2</text>
4010
  </g>
4011
  </g>
4012
  <g id="ytick_3">
4013
  <g id="grid-y--4" class="grid grid-y">
4014
+ <path d="M 47.81 286.938969 L 835.361742 286.938969 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4015
  </g>
4016
  <g id="line2d_9">
4017
  <g>
4018
+ <use ns4:href="#m0fca2865ba" x="47.81" y="286.938969" style="stroke: #000000; stroke-width: 0.8" />
4019
  </g>
4020
  </g>
4021
  <g id="text_9">
4022
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="290.738187" transform="rotate(-0 40.81 290.738187)">1.4</text>
4023
  </g>
4024
  </g>
4025
  <g id="ytick_4">
4026
  <g id="grid-y--5" class="grid grid-y">
4027
+ <path d="M 47.81 225.904838 L 835.361742 225.904838 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4028
  </g>
4029
  <g id="line2d_10">
4030
  <g>
4031
+ <use ns4:href="#m0fca2865ba" x="47.81" y="225.904838" style="stroke: #000000; stroke-width: 0.8" />
4032
  </g>
4033
  </g>
4034
  <g id="text_10">
4035
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="229.704057" transform="rotate(-0 40.81 229.704057)">1.6</text>
4036
  </g>
4037
  </g>
4038
  <g id="ytick_5">
4039
  <g id="grid-y--6" class="grid grid-y">
4040
+ <path d="M 47.81 164.870708 L 835.361742 164.870708 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4041
  </g>
4042
  <g id="line2d_11">
4043
  <g>
4044
+ <use ns4:href="#m0fca2865ba" x="47.81" y="164.870708" style="stroke: #000000; stroke-width: 0.8" />
4045
  </g>
4046
  </g>
4047
  <g id="text_11">
4048
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="168.669926" transform="rotate(-0 40.81 168.669926)">1.8</text>
4049
  </g>
4050
  </g>
4051
  <g id="ytick_6">
4052
  <g id="grid-y--7" class="grid grid-y">
4053
+ <path d="M 47.81 103.836577 L 835.361742 103.836577 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4054
  </g>
4055
  <g id="line2d_12">
4056
  <g>
4057
+ <use ns4:href="#m0fca2865ba" x="47.81" y="103.836577" style="stroke: #000000; stroke-width: 0.8" />
4058
  </g>
4059
  </g>
4060
  <g id="text_12">
4061
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="107.635796" transform="rotate(-0 40.81 107.635796)">2.0</text>
4062
  </g>
4063
  </g>
4064
  <g id="ytick_7">
4065
  <g id="grid-y--8" class="grid grid-y">
4066
+ <path d="M 47.81 42.802447 L 835.361742 42.802447 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4067
  </g>
4068
  <g id="line2d_13">
4069
  <g>
4070
+ <use ns4:href="#m0fca2865ba" x="47.81" y="42.802447" style="stroke: #000000; stroke-width: 0.8" />
4071
  </g>
4072
  </g>
4073
  <g id="text_13">
4074
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="46.601665" transform="rotate(-0 40.81 46.601665)">2.2</text>
4075
  </g>
4076
  </g>
4077
  <g id="label--y" class="ylabel">
 
4079
  </g>
4080
  </g>
4081
  <g id="series--torch-flash-ma" class="series">
4082
+ <path d="M 83.607806 340.639848 L 226.799032 324.181385 L 369.990258 320.559009 L 513.181484 308.901185 L 656.37271 265.282228 L 799.563935 254.967155 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4083
  <defs>
4084
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4085
  </defs>
4086
  <g clip-path="url(#p09feef2583)">
4087
+ <use ns4:href="#md7efaf3aec" x="83.607806" y="340.639848" style="fill: #1f77b4; stroke: #1f77b4" />
4088
+ <use ns4:href="#md7efaf3aec" x="226.799032" y="324.181385" style="fill: #1f77b4; stroke: #1f77b4" />
4089
+ <use ns4:href="#md7efaf3aec" x="369.990258" y="320.559009" style="fill: #1f77b4; stroke: #1f77b4" />
4090
+ <use ns4:href="#md7efaf3aec" x="513.181484" y="308.901185" style="fill: #1f77b4; stroke: #1f77b4" />
4091
+ <use ns4:href="#md7efaf3aec" x="656.37271" y="265.282228" style="fill: #1f77b4; stroke: #1f77b4" />
4092
+ <use ns4:href="#md7efaf3aec" x="799.563935" y="254.967155" style="fill: #1f77b4; stroke: #1f77b4" />
4093
  </g>
4094
  </g>
4095
  <g id="series--torch-mem-eff" class="series">
4096
+ <path d="M 83.607806 156.748591 L 226.799032 137.315018 L 369.990258 105.143013 L 513.181484 114.228248 L 656.37271 86.655469 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4097
  <defs>
4098
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4099
  </defs>
4100
  <g clip-path="url(#p09feef2583)">
4101
+ <use ns4:href="#m9b8c54d372" x="83.607806" y="156.748591" style="fill: #ff7f0e; stroke: #ff7f0e" />
4102
+ <use ns4:href="#m9b8c54d372" x="226.799032" y="137.315018" style="fill: #ff7f0e; stroke: #ff7f0e" />
4103
+ <use ns4:href="#m9b8c54d372" x="369.990258" y="105.143013" style="fill: #ff7f0e; stroke: #ff7f0e" />
4104
+ <use ns4:href="#m9b8c54d372" x="513.181484" y="114.228248" style="fill: #ff7f0e; stroke: #ff7f0e" />
4105
+ <use ns4:href="#m9b8c54d372" x="656.37271" y="86.655469" style="fill: #ff7f0e; stroke: #ff7f0e" />
4106
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4107
  </g>
4108
  </g>
4109
  <g id="series--xformers-meff" class="series">
4110
+ <path d="M 83.607806 410.498293 L 226.799032 399.197519 L 369.990258 383.346345 L 513.181484 381.042612 L 656.37271 332.003214 L 799.563935 335.418073 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4111
  <defs>
4112
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4113
  </defs>
4114
  <g clip-path="url(#p09feef2583)">
4115
+ <use ns4:href="#mc655281e0b" x="83.607806" y="410.498293" style="fill: #2ca02c; stroke: #2ca02c" />
4116
+ <use ns4:href="#mc655281e0b" x="226.799032" y="399.197519" style="fill: #2ca02c; stroke: #2ca02c" />
4117
+ <use ns4:href="#mc655281e0b" x="369.990258" y="383.346345" style="fill: #2ca02c; stroke: #2ca02c" />
4118
+ <use ns4:href="#mc655281e0b" x="513.181484" y="381.042612" style="fill: #2ca02c; stroke: #2ca02c" />
4119
+ <use ns4:href="#mc655281e0b" x="656.37271" y="332.003214" style="fill: #2ca02c; stroke: #2ca02c" />
4120
+ <use ns4:href="#mc655281e0b" x="799.563935" y="335.418073" style="fill: #2ca02c; stroke: #2ca02c" />
4121
  </g>
4122
  </g>
4123
  <g id="series--hf-kernels-flash-attn" class="series">
4124
+ <path d="M 83.607806 418.603626 L 226.799032 405.380276 L 369.990258 389.547718 L 513.181484 382.629499 L 656.37271 335.525188 L 799.563935 340.270592 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4125
  <defs>
4126
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4127
  </defs>
4128
  <g clip-path="url(#p09feef2583)">
4129
+ <use ns4:href="#m61c8040d7e" x="83.607806" y="418.603626" style="fill: #d62728; stroke: #d62728" />
4130
+ <use ns4:href="#m61c8040d7e" x="226.799032" y="405.380276" style="fill: #d62728; stroke: #d62728" />
4131
+ <use ns4:href="#m61c8040d7e" x="369.990258" y="389.547718" style="fill: #d62728; stroke: #d62728" />
4132
+ <use ns4:href="#m61c8040d7e" x="513.181484" y="382.629499" style="fill: #d62728; stroke: #d62728" />
4133
+ <use ns4:href="#m61c8040d7e" x="656.37271" y="335.525188" style="fill: #d62728; stroke: #d62728" />
4134
+ <use ns4:href="#m61c8040d7e" x="799.563935" y="340.270592" style="fill: #d62728; stroke: #d62728" />
4135
  </g>
4136
  </g>
4137
  <g id="series--hf-kernels-flash-attn3" class="series">
4138
+ <path d="M 83.607806 428.387702 L 226.799032 418.05737 L 369.990258 396.545281 L 513.181484 392.764216 L 656.37271 347.753681 L 799.563935 353.503096 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4139
  <defs>
4140
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4141
  </defs>
4142
  <g clip-path="url(#p09feef2583)">
4143
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4144
+ <use ns4:href="#m7cd35be9cc" x="226.799032" y="418.05737" style="fill: #9467bd; stroke: #9467bd" />
4145
+ <use ns4:href="#m7cd35be9cc" x="369.990258" y="396.545281" style="fill: #9467bd; stroke: #9467bd" />
4146
+ <use ns4:href="#m7cd35be9cc" x="513.181484" y="392.764216" style="fill: #9467bd; stroke: #9467bd" />
4147
+ <use ns4:href="#m7cd35be9cc" x="656.37271" y="347.753681" style="fill: #9467bd; stroke: #9467bd" />
4148
+ <use ns4:href="#m7cd35be9cc" x="799.563935" y="353.503096" style="fill: #9467bd; stroke: #9467bd" />
4149
  </g>
4150
  </g>
4151
  <g id="patch_3">
 
4337
  COMBINED BENCHMARK SUMMARY
4338
 
4339
  impl wl p50(ms) ok
4340
+ hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.97 True
4341
  hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
4342
  hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
4343
+ hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.09 True
4344
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True
4345
+ hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
4346
+ hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.94 True
4347
+ hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
4348
+ hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.04 True
4349
+ hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.05 True
4350
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
4351
  hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4352
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4353
+ Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
4354
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4355
+ Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
4356
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4357
+ Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
4358
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4359
+ Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
4360
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4361
+ Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
4362
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4363
+ Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
4364
  torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
4365
+ torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
4366
+ torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
4367
+ torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
4368
  torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
4369
  torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
4370
+ torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
4371
+ torch_mem_eff cuda_attn_L256_bfloat16 1.89 True
4372
+ torch_mem_eff cuda_attn_L320_bfloat16 2.00 True
4373
+ torch_mem_eff cuda_attn_L384_bfloat16 1.97 True
4374
+ torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
4375
  torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
4376
+ xformers_meff cuda_attn_L128_bfloat16 1.00 True
4377
  xformers_meff cuda_attn_L256_bfloat16 1.03 True
4378
  xformers_meff cuda_attn_L320_bfloat16 1.08 True
4379
+ xformers_meff cuda_attn_L384_bfloat16 1.09 True
4380
+ xformers_meff cuda_attn_L448_bfloat16 1.25 True
4381
+ xformers_meff cuda_attn_L512_bfloat16 1.24 True
4382
 
4383
  GENERATING COMBINED VISUALIZATION
4384
 
 
4402
  <div class="uv-install-logs" id="uv-logs-combine">
4403
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4404
  <div class="uv-logs-content" style="display: none;">
4405
+ Installed 37 packages in 208ms
4406
  </div>
4407
  </div>
4408
  <div class="cell-artifacts">
 
4415
  <rdf:RDF>
4416
  <ns2:Work>
4417
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4418
+ <dc:date>2025-10-29T14:28:03.109695</dc:date>
4419
  <dc:format>image/svg+xml</dc:format>
4420
  <dc:creator>
4421
  <ns2:Agent>
 
4525
  <g id="matplotlib.axis_2">
4526
  <g id="ytick_1">
4527
  <g id="grid-y--2" class="grid grid-y">
4528
+ <path d="M 47.81 409.00723 L 835.361742 409.00723 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4529
  </g>
4530
  <g id="line2d_7">
4531
  <defs>
4532
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4533
  </defs>
4534
  <g>
4535
+ <use ns4:href="#m0fca2865ba" x="47.81" y="409.00723" style="stroke: #000000; stroke-width: 0.8" />
4536
  </g>
4537
  </g>
4538
  <g id="text_7">
4539
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="412.806448" transform="rotate(-0 40.81 412.806448)">1.0</text>
4540
  </g>
4541
  </g>
4542
  <g id="ytick_2">
4543
  <g id="grid-y--3" class="grid grid-y">
4544
+ <path d="M 47.81 347.973099 L 835.361742 347.973099 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4545
  </g>
4546
  <g id="line2d_8">
4547
  <g>
4548
+ <use ns4:href="#m0fca2865ba" x="47.81" y="347.973099" style="stroke: #000000; stroke-width: 0.8" />
4549
  </g>
4550
  </g>
4551
  <g id="text_8">
4552
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="351.772318" transform="rotate(-0 40.81 351.772318)">1.2</text>
4553
  </g>
4554
  </g>
4555
  <g id="ytick_3">
4556
  <g id="grid-y--4" class="grid grid-y">
4557
+ <path d="M 47.81 286.938969 L 835.361742 286.938969 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4558
  </g>
4559
  <g id="line2d_9">
4560
  <g>
4561
+ <use ns4:href="#m0fca2865ba" x="47.81" y="286.938969" style="stroke: #000000; stroke-width: 0.8" />
4562
  </g>
4563
  </g>
4564
  <g id="text_9">
4565
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="290.738187" transform="rotate(-0 40.81 290.738187)">1.4</text>
4566
  </g>
4567
  </g>
4568
  <g id="ytick_4">
4569
  <g id="grid-y--5" class="grid grid-y">
4570
+ <path d="M 47.81 225.904838 L 835.361742 225.904838 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4571
  </g>
4572
  <g id="line2d_10">
4573
  <g>
4574
+ <use ns4:href="#m0fca2865ba" x="47.81" y="225.904838" style="stroke: #000000; stroke-width: 0.8" />
4575
  </g>
4576
  </g>
4577
  <g id="text_10">
4578
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="229.704057" transform="rotate(-0 40.81 229.704057)">1.6</text>
4579
  </g>
4580
  </g>
4581
  <g id="ytick_5">
4582
  <g id="grid-y--6" class="grid grid-y">
4583
+ <path d="M 47.81 164.870708 L 835.361742 164.870708 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4584
  </g>
4585
  <g id="line2d_11">
4586
  <g>
4587
+ <use ns4:href="#m0fca2865ba" x="47.81" y="164.870708" style="stroke: #000000; stroke-width: 0.8" />
4588
  </g>
4589
  </g>
4590
  <g id="text_11">
4591
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="168.669926" transform="rotate(-0 40.81 168.669926)">1.8</text>
4592
  </g>
4593
  </g>
4594
  <g id="ytick_6">
4595
  <g id="grid-y--7" class="grid grid-y">
4596
+ <path d="M 47.81 103.836577 L 835.361742 103.836577 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4597
  </g>
4598
  <g id="line2d_12">
4599
  <g>
4600
+ <use ns4:href="#m0fca2865ba" x="47.81" y="103.836577" style="stroke: #000000; stroke-width: 0.8" />
4601
  </g>
4602
  </g>
4603
  <g id="text_12">
4604
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="107.635796" transform="rotate(-0 40.81 107.635796)">2.0</text>
4605
  </g>
4606
  </g>
4607
  <g id="ytick_7">
4608
  <g id="grid-y--8" class="grid grid-y">
4609
+ <path d="M 47.81 42.802447 L 835.361742 42.802447 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4610
  </g>
4611
  <g id="line2d_13">
4612
  <g>
4613
+ <use ns4:href="#m0fca2865ba" x="47.81" y="42.802447" style="stroke: #000000; stroke-width: 0.8" />
4614
  </g>
4615
  </g>
4616
  <g id="text_13">
4617
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="46.601665" transform="rotate(-0 40.81 46.601665)">2.2</text>
4618
  </g>
4619
  </g>
4620
  <g id="label--y" class="ylabel">
 
4622
  </g>
4623
  </g>
4624
  <g id="series--torch-flash-ma" class="series">
4625
+ <path d="M 83.607806 340.639848 L 226.799032 324.181385 L 369.990258 320.559009 L 513.181484 308.901185 L 656.37271 265.282228 L 799.563935 254.967155 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4626
  <defs>
4627
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4628
  </defs>
4629
  <g clip-path="url(#p09feef2583)">
4630
+ <use ns4:href="#md7efaf3aec" x="83.607806" y="340.639848" style="fill: #1f77b4; stroke: #1f77b4" />
4631
+ <use ns4:href="#md7efaf3aec" x="226.799032" y="324.181385" style="fill: #1f77b4; stroke: #1f77b4" />
4632
+ <use ns4:href="#md7efaf3aec" x="369.990258" y="320.559009" style="fill: #1f77b4; stroke: #1f77b4" />
4633
+ <use ns4:href="#md7efaf3aec" x="513.181484" y="308.901185" style="fill: #1f77b4; stroke: #1f77b4" />
4634
+ <use ns4:href="#md7efaf3aec" x="656.37271" y="265.282228" style="fill: #1f77b4; stroke: #1f77b4" />
4635
+ <use ns4:href="#md7efaf3aec" x="799.563935" y="254.967155" style="fill: #1f77b4; stroke: #1f77b4" />
4636
  </g>
4637
  </g>
4638
  <g id="series--torch-mem-eff" class="series">
4639
+ <path d="M 83.607806 156.748591 L 226.799032 137.315018 L 369.990258 105.143013 L 513.181484 114.228248 L 656.37271 86.655469 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4640
  <defs>
4641
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4642
  </defs>
4643
  <g clip-path="url(#p09feef2583)">
4644
+ <use ns4:href="#m9b8c54d372" x="83.607806" y="156.748591" style="fill: #ff7f0e; stroke: #ff7f0e" />
4645
+ <use ns4:href="#m9b8c54d372" x="226.799032" y="137.315018" style="fill: #ff7f0e; stroke: #ff7f0e" />
4646
+ <use ns4:href="#m9b8c54d372" x="369.990258" y="105.143013" style="fill: #ff7f0e; stroke: #ff7f0e" />
4647
+ <use ns4:href="#m9b8c54d372" x="513.181484" y="114.228248" style="fill: #ff7f0e; stroke: #ff7f0e" />
4648
+ <use ns4:href="#m9b8c54d372" x="656.37271" y="86.655469" style="fill: #ff7f0e; stroke: #ff7f0e" />
4649
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4650
  </g>
4651
  </g>
4652
  <g id="series--xformers-meff" class="series">
4653
+ <path d="M 83.607806 410.498293 L 226.799032 399.197519 L 369.990258 383.346345 L 513.181484 381.042612 L 656.37271 332.003214 L 799.563935 335.418073 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4654
  <defs>
4655
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4656
  </defs>
4657
  <g clip-path="url(#p09feef2583)">
4658
+ <use ns4:href="#mc655281e0b" x="83.607806" y="410.498293" style="fill: #2ca02c; stroke: #2ca02c" />
4659
+ <use ns4:href="#mc655281e0b" x="226.799032" y="399.197519" style="fill: #2ca02c; stroke: #2ca02c" />
4660
+ <use ns4:href="#mc655281e0b" x="369.990258" y="383.346345" style="fill: #2ca02c; stroke: #2ca02c" />
4661
+ <use ns4:href="#mc655281e0b" x="513.181484" y="381.042612" style="fill: #2ca02c; stroke: #2ca02c" />
4662
+ <use ns4:href="#mc655281e0b" x="656.37271" y="332.003214" style="fill: #2ca02c; stroke: #2ca02c" />
4663
+ <use ns4:href="#mc655281e0b" x="799.563935" y="335.418073" style="fill: #2ca02c; stroke: #2ca02c" />
4664
  </g>
4665
  </g>
4666
  <g id="series--hf-kernels-flash-attn" class="series">
4667
+ <path d="M 83.607806 418.603626 L 226.799032 405.380276 L 369.990258 389.547718 L 513.181484 382.629499 L 656.37271 335.525188 L 799.563935 340.270592 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4668
  <defs>
4669
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4670
  </defs>
4671
  <g clip-path="url(#p09feef2583)">
4672
+ <use ns4:href="#m61c8040d7e" x="83.607806" y="418.603626" style="fill: #d62728; stroke: #d62728" />
4673
+ <use ns4:href="#m61c8040d7e" x="226.799032" y="405.380276" style="fill: #d62728; stroke: #d62728" />
4674
+ <use ns4:href="#m61c8040d7e" x="369.990258" y="389.547718" style="fill: #d62728; stroke: #d62728" />
4675
+ <use ns4:href="#m61c8040d7e" x="513.181484" y="382.629499" style="fill: #d62728; stroke: #d62728" />
4676
+ <use ns4:href="#m61c8040d7e" x="656.37271" y="335.525188" style="fill: #d62728; stroke: #d62728" />
4677
+ <use ns4:href="#m61c8040d7e" x="799.563935" y="340.270592" style="fill: #d62728; stroke: #d62728" />
4678
  </g>
4679
  </g>
4680
  <g id="series--hf-kernels-flash-attn3" class="series">
4681
+ <path d="M 83.607806 428.387702 L 226.799032 418.05737 L 369.990258 396.545281 L 513.181484 392.764216 L 656.37271 347.753681 L 799.563935 353.503096 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4682
  <defs>
4683
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4684
  </defs>
4685
  <g clip-path="url(#p09feef2583)">
4686
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4687
+ <use ns4:href="#m7cd35be9cc" x="226.799032" y="418.05737" style="fill: #9467bd; stroke: #9467bd" />
4688
+ <use ns4:href="#m7cd35be9cc" x="369.990258" y="396.545281" style="fill: #9467bd; stroke: #9467bd" />
4689
+ <use ns4:href="#m7cd35be9cc" x="513.181484" y="392.764216" style="fill: #9467bd; stroke: #9467bd" />
4690
+ <use ns4:href="#m7cd35be9cc" x="656.37271" y="347.753681" style="fill: #9467bd; stroke: #9467bd" />
4691
+ <use ns4:href="#m7cd35be9cc" x="799.563935" y="353.503096" style="fill: #9467bd; stroke: #9467bd" />
4692
  </g>
4693
  </g>
4694
  <g id="patch_3">
index.html CHANGED
The diff for this file is too large to render. See raw diff
 
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl CHANGED
@@ -1,4 +1,4 @@
1
- {"ts": "2025-10-28T14:08:59Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8268990000033227, "p50": 0.8360890000176369, "p90": 0.8378790000733716, "mean": 0.8358750000070359, "iqr": 0.002010000116570154, "raw_times": [0.8426389999840467, 0.8268990000033227, 0.8378790000733716, 0.8360890000176369, 0.8358689999568014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8452999999235544, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
2
- {"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6477070000746608, "p50": 1.6516379999984565, "p90": 1.6565669999408783, "mean": 1.6533151999965412, "iqr": 0.006360999918797461, "raw_times": [1.6565669999408783, 1.6516379999984565, 1.6477070000746608, 1.6604579999466296, 1.6502060000220808], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6544470000781075, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
3
- {"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6441269999631913, "p50": 1.6532669999378413, "p90": 1.6534970000066096, "mean": 1.6500411999913922, "iqr": 0.009149999982582813, "raw_times": [1.6441269999631913, 1.6534970000066096, 1.6532669999378413, 1.6443470000240268, 1.654968000025292], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6665570000213847, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
4
- {"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.251962999911484, "p50": 3.270412999995642, "p90": 3.2735430000911947, "mean": 3.2660931999998866, "iqr": 0.01632000009976764, "raw_times": [3.2735430000911947, 3.251962999911484, 3.257222999991427, 3.277324000009685, 3.270412999995642], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2640430000583365, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
 
1
+ {"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8322699999894212, "p50": 0.8364899999833142, "p90": 0.8382409999967422, "mean": 0.8359703999872181, "iqr": 0.0036810000096920703, "raw_times": [0.8322699999894212, 0.8382909999795629, 0.8345599999870501, 0.8382409999967422, 0.8364899999833142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8381600000006983, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
2
+ {"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6435499999829517, "p50": 1.6499199999771008, "p90": 1.6516499999852385, "mean": 1.650240399987979, "iqr": 0.0024989999474200886, "raw_times": [1.6516499999852385, 1.6435499999829517, 1.6499199999771008, 1.6491510000378184, 1.656930999956785], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.644769999984419, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
3
+ {"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6425610000396773, "p50": 1.6517310000381258, "p90": 1.654420999955164, "mean": 1.6505027999983213, "iqr": 0.006990999963818467, "raw_times": [1.6474299999913455, 1.6517310000381258, 1.654420999955164, 1.6563709999672938, 1.6425610000396773], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6471609999939574, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
4
+ {"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.237169999977141, "p50": 3.2576509999557857, "p90": 3.264301000001524, "mean": 3.257706599993071, "iqr": 0.008230999981151399, "raw_times": [3.2576509999557857, 3.264301000001524, 3.2733410000105323, 3.2560700000203724, 3.237169999977141], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2725309999932506, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
layer_norm/impls/hf_kernels_layer_norm.html CHANGED
@@ -3862,8 +3862,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3862
  </div>
3863
 
3864
  <div class="main-content">
3865
- <h2>on_github: huggingface/kernels-uvnotes</h2>
3866
- <h1>HF Kernels LayerNorm Implementation</h1>
3867
  <p>Based on kernels-community <code>layer-norm</code> kernel.</p>
3868
  <h2>LayerNorm Benchmark (HF Kernels)</h2>
3869
  <div class="cell" id="cell-benchmark">
@@ -3873,10 +3872,11 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3873
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3874
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3875
  </span> |
3876
- Cell: benchmark | 7.03s
3877
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3878
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3879
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3880
  </div>
3881
  <div id="code-benchmark" class="cell-code" data-lines="49">
3882
  <div class="code-wrap">
@@ -3943,19 +3943,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3945
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3946
- hf_kernels_layer_norm 4.56% 180.575us 46.01% 1.822ms 1.822ms 0.000us 0.00% 3.098ms 3.098ms 1
3947
- _layer_norm_f8ec252::dropout_add_ln_fwd 1.70% 67.272us 40.91% 1.619ms 539.829us 2.362ms 100.00% 3.098ms 1.033ms 3
3948
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.363ms 100.06% 2.363ms 2.363ms 1
3949
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.362ms 100.00% 2.362ms 787.305us 3
3950
- Activity Buffer Request 36.75% 1.455ms 36.75% 1.455ms 1.455ms 736.127us 31.17% 736.127us 736.127us 1
3951
- aten::view 0.54% 21.512us 0.54% 21.512us 3.585us 0.000us 0.00% 0.000us 0.000us 6
3952
- aten::empty 1.17% 46.231us 1.17% 46.231us 5.137us 0.000us 0.00% 0.000us 0.000us 9
3953
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.23% 9.070us 0.23% 9.070us 3.023us 0.000us 0.00% 0.000us 0.000us 3
3954
- cudaLaunchKernel 1.06% 41.913us 1.06% 41.913us 13.971us 0.000us 0.00% 0.000us 0.000us 3
3955
- cudaDeviceSynchronize 53.99% 2.137ms 53.99% 2.137ms 2.137ms 0.000us 0.00% 0.000us 0.000us 1
3956
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3957
- Self CPU time total: 3.959ms
3958
- Self CUDA time total: 2.362ms
3959
 
3960
 
3961
 
@@ -3965,19 +3965,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
3965
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3966
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3967
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3968
- hf_kernels_layer_norm 2.19% 144.024us 30.18% 1.989ms 1.989ms 0.000us 0.00% 6.322ms 6.322ms 1
3969
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.69% 45.641us 27.80% 1.832ms 610.764us 4.774ms 100.00% 6.322ms 2.107ms 3
3970
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.776ms 100.03% 4.776ms 4.776ms 1
3971
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.774ms 100.00% 4.774ms 1.591ms 3
3972
- Activity Buffer Request 26.09% 1.720ms 26.09% 1.720ms 1.720ms 1.548ms 32.42% 1.548ms 1.548ms 1
3973
- aten::view 0.20% 12.871us 0.20% 12.871us 2.145us 0.000us 0.00% 0.000us 0.000us 6
3974
- aten::empty 0.50% 32.981us 0.50% 32.981us 3.665us 0.000us 0.00% 0.000us 0.000us 9
3975
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.881us 0.07% 4.881us 1.627us 0.000us 0.00% 0.000us 0.000us 3
3976
- cudaLaunchKernel 0.44% 29.151us 0.44% 29.151us 9.717us 0.000us 0.00% 0.000us 0.000us 3
3977
- cudaDeviceSynchronize 69.82% 4.602ms 69.82% 4.602ms 4.602ms 0.000us 0.00% 0.000us 0.000us 1
3978
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3979
- Self CPU time total: 6.591ms
3980
- Self CUDA time total: 4.774ms
3981
 
3982
 
3983
 
@@ -3987,19 +3987,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
- hf_kernels_layer_norm 1.89% 121.823us 28.69% 1.852ms 1.852ms 0.000us 0.00% 6.323ms 6.323ms 1
3991
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.69% 44.435us 26.61% 1.718ms 572.663us 4.766ms 100.00% 6.323ms 2.108ms 3
3992
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.767ms 100.03% 4.767ms 4.767ms 1
3993
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.766ms 100.00% 4.766ms 1.589ms 3
3994
- Activity Buffer Request 24.91% 1.608ms 24.91% 1.608ms 1.608ms 1.557ms 32.67% 1.557ms 1.557ms 1
3995
- aten::view 0.19% 12.441us 0.19% 12.441us 2.074us 0.000us 0.00% 0.000us 0.000us 6
3996
- aten::empty 0.50% 32.030us 0.50% 32.030us 3.559us 0.000us 0.00% 0.000us 0.000us 9
3997
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.850us 0.08% 4.850us 1.617us 0.000us 0.00% 0.000us 0.000us 3
3998
- cudaLaunchKernel 0.44% 28.190us 0.44% 28.190us 9.397us 0.000us 0.00% 0.000us 0.000us 3
3999
- cudaDeviceSynchronize 71.31% 4.604ms 71.31% 4.604ms 4.604ms 0.000us 0.00% 0.000us 0.000us 1
4000
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4001
- Self CPU time total: 6.457ms
4002
- Self CUDA time total: 4.766ms
4003
 
4004
 
4005
 
@@ -4009,37 +4009,38 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
4009
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4010
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4011
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4012
- hf_kernels_layer_norm 1.32% 150.697us 17.31% 1.975ms 1.975ms 0.000us 0.00% 12.822ms 12.822ms 1
4013
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.42% 47.993us 15.87% 1.810ms 603.497us 9.629ms 100.00% 12.822ms 4.274ms 3
4014
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.631ms 100.01% 9.631ms 9.631ms 1
4015
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.629ms 100.00% 9.629ms 3.210ms 3
4016
- Activity Buffer Request 12.56% 1.433ms 12.56% 1.433ms 1.433ms 3.193ms 33.16% 3.193ms 3.193ms 1
4017
- aten::view 0.12% 13.330us 0.12% 13.330us 2.222us 0.000us 0.00% 0.000us 0.000us 6
4018
- aten::empty 0.28% 32.431us 0.28% 32.431us 3.603us 0.000us 0.00% 0.000us 0.000us 9
4019
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.260us 0.05% 5.260us 1.753us 0.000us 0.00% 0.000us 0.000us 3
4020
- cudaLaunchKernel 2.56% 291.579us 2.56% 291.579us 97.193us 0.000us 0.00% 0.000us 0.000us 3
4021
- cudaDeviceSynchronize 82.69% 9.436ms 82.69% 9.436ms 9.436ms 0.000us 0.00% 0.000us 0.000us 1
4022
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4023
- Self CPU time total: 11.410ms
4024
- Self CUDA time total: 9.629ms
4025
 
4026
 
4027
  impl wl p50(ms) ok
4028
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
4029
  hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4030
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4031
- hf_kernels_layer_norm LN_B16_S4096_D8192 3.27 True
4032
  </pre></div>
4033
  <div class="uv-install-logs" id="uv-logs-benchmark">
4034
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4035
  <div class="uv-logs-content" style="display: none;">
 
 
4036
  Installed 15 packages in 13ms
4037
  </div>
4038
  </div>
4039
  <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4040
- Fetching 4 files: 25%|██▌ | 1/4 [00:00&lt;00:00, 8.47it/s]
4041
- Fetching 4 files: 50%|█████ | 2/4 [00:02&lt;00:02, 1.44s/it]
4042
- Fetching 4 files: 100%|██████████| 4/4 [00:02&lt;00:00, 1.61it/s]</div>
4043
  <div class="cell-artifacts">
4044
  <h4>Artifacts:</h4>
4045
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
 
3862
  </div>
3863
 
3864
  <div class="main-content">
3865
+ <h1>HF Kernels LayerNorm Implementation</h1>
 
3866
  <p>Based on kernels-community <code>layer-norm</code> kernel.</p>
3867
  <h2>LayerNorm Benchmark (HF Kernels)</h2>
3868
  <div class="cell" id="cell-benchmark">
 
3872
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3873
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3874
  </span> |
3875
+ Cell: benchmark | 6.34s
3876
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3877
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3878
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
3879
+ <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/layer_norm/impls/hf_kernels_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
3880
  </div>
3881
  <div id="code-benchmark" class="cell-code" data-lines="49">
3882
  <div class="code-wrap">
 
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3945
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3946
+ hf_kernels_layer_norm 5.26% 209.855us 46.73% 1.864ms 1.864ms 0.000us 0.00% 3.097ms 3.097ms 1
3947
+ _layer_norm_f8ec252::dropout_add_ln_fwd 1.78% 70.832us 40.86% 1.630ms 543.337us 2.360ms 100.00% 3.097ms 1.032ms 3
3948
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.362ms 100.06% 2.362ms 2.362ms 1
3949
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.360ms 100.00% 2.360ms 786.699us 3
3950
+ Activity Buffer Request 36.61% 1.461ms 36.61% 1.461ms 1.461ms 736.736us 31.22% 736.736us 736.736us 1
3951
+ aten::view 0.61% 24.271us 0.61% 24.271us 4.045us 0.000us 0.00% 0.000us 0.000us 6
3952
+ aten::empty 1.19% 47.642us 1.19% 47.642us 5.294us 0.000us 0.00% 0.000us 0.000us 9
3953
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.27% 10.789us 0.27% 10.789us 3.596us 0.000us 0.00% 0.000us 0.000us 3
3954
+ cudaLaunchKernel 1.01% 40.102us 1.01% 40.102us 13.367us 0.000us 0.00% 0.000us 0.000us 3
3955
+ cudaDeviceSynchronize 53.27% 2.125ms 53.27% 2.125ms 2.125ms 0.000us 0.00% 0.000us 0.000us 1
3956
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3957
+ Self CPU time total: 3.989ms
3958
+ Self CUDA time total: 2.360ms
3959
 
3960
 
3961
 
 
3965
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3966
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3967
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3968
+ hf_kernels_layer_norm 2.24% 143.733us 27.27% 1.751ms 1.751ms 0.000us 0.00% 6.440ms 6.440ms 1
3969
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.75% 48.181us 24.84% 1.595ms 531.669us 4.846ms 100.00% 6.440ms 2.147ms 3
3970
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.848ms 100.03% 4.848ms 4.848ms 1
3971
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.846ms 100.00% 4.846ms 1.615ms 3
3972
+ Activity Buffer Request 23.08% 1.482ms 23.08% 1.482ms 1.482ms 1.594ms 32.88% 1.594ms 1.594ms 1
3973
+ aten::view 0.20% 12.572us 0.20% 12.572us 2.095us 0.000us 0.00% 0.000us 0.000us 6
3974
+ aten::empty 0.46% 29.840us 0.46% 29.840us 3.316us 0.000us 0.00% 0.000us 0.000us 9
3975
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.420us 0.08% 5.420us 1.807us 0.000us 0.00% 0.000us 0.000us 3
3976
+ cudaLaunchKernel 0.46% 29.490us 0.46% 29.490us 9.830us 0.000us 0.00% 0.000us 0.000us 3
3977
+ cudaDeviceSynchronize 72.73% 4.670ms 72.73% 4.670ms 4.670ms 0.000us 0.00% 0.000us 0.000us 1
3978
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3979
+ Self CPU time total: 6.421ms
3980
+ Self CUDA time total: 4.846ms
3981
 
3982
 
3983
 
 
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
+ hf_kernels_layer_norm 1.96% 126.465us 27.43% 1.766ms 1.766ms 0.000us 0.00% 6.435ms 6.435ms 1
3991
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.73% 46.779us 25.26% 1.627ms 542.360us 4.838ms 100.00% 6.435ms 2.145ms 3
3992
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.839ms 100.03% 4.839ms 4.839ms 1
3993
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.838ms 100.00% 4.838ms 1.613ms 3
3994
+ Activity Buffer Request 23.54% 1.516ms 23.54% 1.516ms 1.516ms 1.597ms 33.01% 1.597ms 1.597ms 1
3995
+ aten::view 0.20% 12.929us 0.20% 12.929us 2.155us 0.000us 0.00% 0.000us 0.000us 6
3996
+ aten::empty 0.46% 29.911us 0.46% 29.911us 3.323us 0.000us 0.00% 0.000us 0.000us 9
3997
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.300us 0.08% 5.300us 1.767us 0.000us 0.00% 0.000us 0.000us 3
3998
+ cudaLaunchKernel 0.45% 29.003us 0.45% 29.003us 9.668us 0.000us 0.00% 0.000us 0.000us 3
3999
+ cudaDeviceSynchronize 72.57% 4.674ms 72.57% 4.674ms 4.674ms 0.000us 0.00% 0.000us 0.000us 1
4000
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4001
+ Self CPU time total: 6.440ms
4002
+ Self CUDA time total: 4.838ms
4003
 
4004
 
4005
 
 
4009
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4010
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4011
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4012
+ hf_kernels_layer_norm 1.17% 134.085us 17.09% 1.957ms 1.957ms 0.000us 0.00% 12.886ms 12.886ms 1
4013
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.41% 46.869us 15.80% 1.809ms 603.015us 9.665ms 100.00% 12.886ms 4.295ms 3
4014
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.667ms 100.01% 9.667ms 9.667ms 1
4015
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.665ms 100.00% 9.665ms 3.222ms 3
4016
+ Activity Buffer Request 12.76% 1.462ms 12.76% 1.462ms 1.462ms 3.220ms 33.32% 3.220ms 3.220ms 1
4017
+ aten::view 0.12% 13.968us 0.12% 13.968us 2.328us 0.000us 0.00% 0.000us 0.000us 6
4018
+ aten::empty 0.26% 30.043us 0.26% 30.043us 3.338us 0.000us 0.00% 0.000us 0.000us 9
4019
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.590us 0.05% 5.590us 1.863us 0.000us 0.00% 0.000us 0.000us 3
4020
+ cudaLaunchKernel 2.31% 264.797us 2.31% 264.797us 88.266us 0.000us 0.00% 0.000us 0.000us 3
4021
+ cudaDeviceSynchronize 82.91% 9.495ms 82.91% 9.495ms 9.495ms 0.000us 0.00% 0.000us 0.000us 1
4022
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4023
+ Self CPU time total: 11.452ms
4024
+ Self CUDA time total: 9.665ms
4025
 
4026
 
4027
  impl wl p50(ms) ok
4028
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
4029
  hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4030
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4031
+ hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
4032
  </pre></div>
4033
  <div class="uv-install-logs" id="uv-logs-benchmark">
4034
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4035
  <div class="uv-logs-content" style="display: none;">
4036
+ Downloading hf-xet (3.2MiB)
4037
+ Downloading hf-xet
4038
  Installed 15 packages in 13ms
4039
  </div>
4040
  </div>
4041
  <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4042
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.22it/s]
4043
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.44it/s]</div>
 
4044
  <div class="cell-artifacts">
4045
  <h4>Artifacts:</h4>
4046
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
layer_norm/impls/torch_layer_norm.html CHANGED
@@ -3862,8 +3862,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3862
  </div>
3863
 
3864
  <div class="main-content">
3865
- <h2>on_github: huggingface/kernels-uvnotes</h2>
3866
- <h1>Torch LayerNorm Implementation</h1>
3867
  <h2>GPU Info</h2>
3868
  <div class="cell" id="cell-nv">
3869
  <div class="cell-header">
@@ -3872,10 +3871,11 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3872
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3873
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3874
  </span> |
3875
- Cell: nv | 0.22s
3876
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3877
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3878
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3879
  </div>
3880
  <div id="code-nv" class="cell-code" data-lines="2">
3881
  <div class="code-wrap">
@@ -3887,7 +3887,7 @@ Cell: nv | 0.22s
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
- <div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:08:35 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.22s
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
- | N/A 31C P0 141W / 350W | 0MiB / 46068MiB | 21% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
@@ -3920,10 +3920,11 @@ Cell: nv | 0.22s
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
- Cell: benchmark | 7.39s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3927
  </div>
3928
  <div id="code-benchmark" class="cell-code" data-lines="26">
3929
  <div class="code-wrap">
@@ -3967,19 +3968,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
3967
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3968
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3969
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3970
- torch_layer_norm 3.94% 153.126us 46.06% 1.791ms 1.791ms 0.000us 0.00% 3.027ms 3.027ms 1
3971
- aten::layer_norm 0.44% 17.151us 42.12% 1.638ms 545.972us 0.000us 0.00% 3.027ms 1.009ms 3
3972
- aten::native_layer_norm 1.99% 77.265us 41.68% 1.621ms 540.255us 2.317ms 100.00% 3.027ms 1.009ms 3
3973
  torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.318ms 100.06% 2.318ms 2.318ms 1
3974
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.317ms 100.00% 2.317ms 772.230us 3
3975
- Activity Buffer Request 37.14% 1.444ms 37.14% 1.444ms 1.444ms 709.980us 30.65% 709.980us 709.980us 1
3976
- aten::empty 1.21% 46.960us 1.21% 46.960us 5.218us 0.000us 0.00% 0.000us 0.000us 9
3977
- cudaLaunchKernel 1.16% 45.271us 1.16% 45.271us 15.090us 0.000us 0.00% 0.000us 0.000us 3
3978
- aten::view 0.18% 7.130us 0.18% 7.130us 1.188us 0.000us 0.00% 0.000us 0.000us 6
3979
- cudaDeviceSynchronize 53.94% 2.098ms 53.94% 2.098ms 2.098ms 0.000us 0.00% 0.000us 0.000us 1
3980
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3981
- Self CPU time total: 3.889ms
3982
- Self CUDA time total: 2.317ms
3983
 
3984
 
3985
 
@@ -3989,19 +3990,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
- torch_layer_norm 1.11% 71.092us 25.40% 1.622ms 1.622ms 0.000us 0.00% 6.494ms 6.494ms 1
3993
- aten::layer_norm 0.16% 10.119us 24.29% 1.551ms 517.038us 0.000us 0.00% 6.494ms 2.165ms 3
3994
- aten::native_layer_norm 0.82% 52.103us 24.13% 1.541ms 513.665us 4.898ms 100.00% 6.494ms 2.165ms 3
3995
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.899ms 100.03% 4.899ms 4.899ms 1
3996
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.898ms 100.00% 4.898ms 1.633ms 3
3997
- Activity Buffer Request 22.36% 1.428ms 22.36% 1.428ms 1.428ms 1.596ms 32.59% 1.596ms 1.596ms 1
3998
- aten::empty 0.49% 31.052us 0.49% 31.052us 3.450us 0.000us 0.00% 0.000us 0.000us 9
3999
- cudaLaunchKernel 0.41% 26.160us 0.41% 26.160us 8.720us 0.000us 0.00% 0.000us 0.000us 3
4000
- aten::view 0.06% 3.830us 0.06% 3.830us 0.638us 0.000us 0.00% 0.000us 0.000us 6
4001
- cudaDeviceSynchronize 74.60% 4.764ms 74.60% 4.764ms 4.764ms 0.000us 0.00% 0.000us 0.000us 1
4002
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4003
- Self CPU time total: 6.386ms
4004
- Self CUDA time total: 4.898ms
4005
 
4006
 
4007
 
@@ -4011,19 +4012,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
4011
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4012
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4013
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4014
- torch_layer_norm 1.17% 72.893us 26.00% 1.616ms 1.616ms 0.000us 0.00% 6.248ms 6.248ms 1
4015
- aten::layer_norm 0.15% 9.290us 24.82% 1.543ms 514.468us 0.000us 0.00% 6.248ms 2.083ms 3
4016
- aten::native_layer_norm 0.84% 52.403us 24.67% 1.534ms 511.371us 4.735ms 100.00% 6.248ms 2.083ms 3
4017
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.736ms 100.03% 4.736ms 4.736ms 1
4018
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.735ms 100.00% 4.735ms 1.578ms 3
4019
- Activity Buffer Request 22.86% 1.421ms 22.86% 1.421ms 1.421ms 1.513ms 31.96% 1.513ms 1.513ms 1
4020
- aten::empty 0.47% 29.320us 0.47% 29.320us 3.258us 0.000us 0.00% 0.000us 0.000us 9
4021
- cudaLaunchKernel 0.43% 26.781us 0.43% 26.781us 8.927us 0.000us 0.00% 0.000us 0.000us 3
4022
- aten::view 0.07% 4.140us 0.07% 4.140us 0.690us 0.000us 0.00% 0.000us 0.000us 6
4023
- cudaDeviceSynchronize 74.00% 4.601ms 74.00% 4.601ms 4.601ms 0.000us 0.00% 0.000us 0.000us 1
4024
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
- Self CPU time total: 6.218ms
4026
- Self CUDA time total: 4.735ms
4027
 
4028
 
4029
 
@@ -4033,19 +4034,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
4033
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4034
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
- torch_layer_norm 0.66% 74.633us 14.54% 1.650ms 1.650ms 0.000us 0.00% 13.090ms 13.090ms 1
4037
- aten::layer_norm 0.09% 9.800us 13.88% 1.575ms 525.028us 0.000us 0.00% 13.090ms 4.363ms 3
4038
- aten::native_layer_norm 0.45% 51.390us 13.79% 1.565ms 521.762us 9.838ms 100.00% 13.090ms 4.363ms 3
4039
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.839ms 100.01% 9.839ms 9.839ms 1
4040
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.838ms 100.00% 9.838ms 3.279ms 3
4041
- Activity Buffer Request 11.36% 1.289ms 11.36% 1.289ms 1.289ms 3.253ms 33.06% 3.253ms 3.253ms 1
4042
- aten::empty 0.28% 31.381us 0.28% 31.381us 3.487us 0.000us 0.00% 0.000us 0.000us 9
4043
- cudaLaunchKernel 1.67% 189.088us 1.67% 189.088us 63.029us 0.000us 0.00% 0.000us 0.000us 3
4044
- aten::view 0.04% 4.121us 0.04% 4.121us 0.687us 0.000us 0.00% 0.000us 0.000us 6
4045
- cudaDeviceSynchronize 85.46% 9.697ms 85.46% 9.697ms 9.697ms 0.000us 0.00% 0.000us 0.000us 1
4046
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4047
- Self CPU time total: 11.347ms
4048
- Self CUDA time total: 9.838ms
4049
 
4050
 
4051
  impl wl p50(ms) ok
@@ -4057,7 +4058,7 @@ torch_layer_norm LN_B16_S4096_D8192 3.33 True
4057
  <div class="uv-install-logs" id="uv-logs-benchmark">
4058
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4059
  <div class="uv-logs-content" style="display: none;">
4060
- Installed 37 packages in 221ms
4061
  </div>
4062
  </div>
4063
  <div class="cell-artifacts">
 
3862
  </div>
3863
 
3864
  <div class="main-content">
3865
+ <h1>Torch LayerNorm Implementation</h1>
 
3866
  <h2>GPU Info</h2>
3867
  <div class="cell" id="cell-nv">
3868
  <div class="cell-header">
 
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: nv | 0.26s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
3878
+ <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/layer_norm/impls/torch_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
3879
  </div>
3880
  <div id="code-nv" class="cell-code" data-lines="2">
3881
  <div class="code-wrap">
 
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
+ <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:26:26 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
 
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
+ | N/A 30C P0 108W / 350W | 0MiB / 46068MiB | 100% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
 
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
+ Cell: benchmark | 7.36s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
3927
+ <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/layer_norm/impls/torch_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
3928
  </div>
3929
  <div id="code-benchmark" class="cell-code" data-lines="26">
3930
  <div class="code-wrap">
 
3968
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3969
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3970
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3971
+ torch_layer_norm 3.90% 151.572us 46.01% 1.786ms 1.786ms 0.000us 0.00% 3.026ms 3.026ms 1
3972
+ aten::layer_norm 0.43% 16.762us 42.11% 1.635ms 544.851us 0.000us 0.00% 3.026ms 1.009ms 3
3973
+ aten::native_layer_norm 2.06% 80.009us 41.67% 1.618ms 539.263us 2.316ms 100.00% 3.026ms 1.009ms 3
3974
  torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.318ms 100.06% 2.318ms 2.318ms 1
3975
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.316ms 100.00% 2.316ms 772.127us 3
3976
+ Activity Buffer Request 37.08% 1.440ms 37.08% 1.440ms 1.440ms 709.855us 30.65% 709.855us 709.855us 1
3977
+ aten::empty 1.19% 46.261us 1.19% 46.261us 5.140us 0.000us 0.00% 0.000us 0.000us 9
3978
+ cudaLaunchKernel 1.16% 45.163us 1.16% 45.163us 15.054us 0.000us 0.00% 0.000us 0.000us 3
3979
+ aten::view 0.17% 6.761us 0.17% 6.761us 1.127us 0.000us 0.00% 0.000us 0.000us 6
3980
+ cudaDeviceSynchronize 53.99% 2.096ms 53.99% 2.096ms 2.096ms 0.000us 0.00% 0.000us 0.000us 1
3981
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3982
+ Self CPU time total: 3.882ms
3983
+ Self CUDA time total: 2.316ms
3984
 
3985
 
3986
 
 
3990
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3991
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3992
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3993
+ torch_layer_norm 1.19% 75.581us 25.55% 1.628ms 1.628ms 0.000us 0.00% 6.473ms 6.473ms 1
3994
+ aten::layer_norm 0.14% 9.142us 24.37% 1.553ms 517.550us 0.000us 0.00% 6.473ms 2.158ms 3
3995
+ aten::native_layer_norm 0.81% 51.921us 24.22% 1.544ms 514.502us 4.881ms 100.00% 6.473ms 2.158ms 3
3996
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.882ms 100.03% 4.882ms 4.882ms 1
3997
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.881ms 100.00% 4.881ms 1.627ms 3
3998
+ Activity Buffer Request 22.46% 1.431ms 22.46% 1.431ms 1.431ms 1.592ms 32.61% 1.592ms 1.592ms 1
3999
+ aten::empty 0.44% 27.841us 0.44% 27.841us 3.093us 0.000us 0.00% 0.000us 0.000us 9
4000
+ cudaLaunchKernel 0.45% 28.910us 0.45% 28.910us 9.637us 0.000us 0.00% 0.000us 0.000us 3
4001
+ aten::view 0.06% 3.829us 0.06% 3.829us 0.638us 0.000us 0.00% 0.000us 0.000us 6
4002
+ cudaDeviceSynchronize 74.45% 4.743ms 74.45% 4.743ms 4.743ms 0.000us 0.00% 0.000us 0.000us 1
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
+ Self CPU time total: 6.372ms
4005
+ Self CUDA time total: 4.881ms
4006
 
4007
 
4008
 
 
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4014
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4015
+ torch_layer_norm 1.15% 71.882us 26.71% 1.668ms 1.668ms 0.000us 0.00% 6.222ms 6.222ms 1
4016
+ aten::layer_norm 0.15% 9.629us 25.56% 1.596ms 532.153us 0.000us 0.00% 6.222ms 2.074ms 3
4017
+ aten::native_layer_norm 0.90% 56.373us 25.41% 1.587ms 528.943us 4.717ms 100.00% 6.222ms 2.074ms 3
4018
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.718ms 100.03% 4.718ms 4.718ms 1
4019
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.717ms 100.00% 4.717ms 1.572ms 3
4020
+ Activity Buffer Request 23.44% 1.464ms 23.44% 1.464ms 1.464ms 1.506ms 31.93% 1.506ms 1.506ms 1
4021
+ aten::empty 0.46% 28.850us 0.46% 28.850us 3.206us 0.000us 0.00% 0.000us 0.000us 9
4022
+ cudaLaunchKernel 0.52% 32.781us 0.52% 32.781us 10.927us 0.000us 0.00% 0.000us 0.000us 3
4023
+ aten::view 0.07% 4.590us 0.07% 4.590us 0.765us 0.000us 0.00% 0.000us 0.000us 6
4024
+ cudaDeviceSynchronize 73.29% 4.577ms 73.29% 4.577ms 4.577ms 0.000us 0.00% 0.000us 0.000us 1
4025
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4026
+ Self CPU time total: 6.246ms
4027
+ Self CUDA time total: 4.717ms
4028
 
4029
 
4030
 
 
4034
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4035
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
+ torch_layer_norm 0.67% 74.340us 13.35% 1.490ms 1.490ms 0.000us 0.00% 13.028ms 13.028ms 1
4038
+ aten::layer_norm 0.09% 9.510us 12.69% 1.416ms 471.835us 0.000us 0.00% 13.028ms 4.343ms 3
4039
+ aten::native_layer_norm 0.47% 52.269us 12.60% 1.406ms 468.665us 9.808ms 100.00% 13.028ms 4.343ms 3
4040
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.809ms 100.02% 9.809ms 9.809ms 1
4041
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.808ms 100.00% 9.808ms 3.269ms 3
4042
+ Activity Buffer Request 9.72% 1.085ms 9.72% 1.085ms 1.085ms 3.220ms 32.83% 3.220ms 3.220ms 1
4043
+ aten::empty 0.26% 29.181us 0.26% 29.181us 3.242us 0.000us 0.00% 0.000us 0.000us 9
4044
+ cudaLaunchKernel 2.11% 235.817us 2.11% 235.817us 78.606us 0.000us 0.00% 0.000us 0.000us 3
4045
+ aten::view 0.04% 4.022us 0.04% 4.022us 0.670us 0.000us 0.00% 0.000us 0.000us 6
4046
+ cudaDeviceSynchronize 86.65% 9.669ms 86.65% 9.669ms 9.669ms 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
+ Self CPU time total: 11.159ms
4049
+ Self CUDA time total: 9.808ms
4050
 
4051
 
4052
  impl wl p50(ms) ok
 
4058
  <div class="uv-install-logs" id="uv-logs-benchmark">
4059
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4060
  <div class="uv-logs-content" style="display: none;">
4061
+ Installed 37 packages in 222ms
4062
  </div>
4063
  </div>
4064
  <div class="cell-artifacts">
layer_norm/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 1e41c135df9f0b506fa1ac950b90bd609d850f01d79b3171b3678c24fdab066a
  • Pointer size: 130 Bytes
  • Size of remote file: 14.6 kB

Git LFS Details

  • SHA256: 8fd53794c4617f7e947676c655de6f739b720b8f16a59432369c127bfc08190a
  • Pointer size: 130 Bytes
  • Size of remote file: 14.6 kB
layer_norm/results/combined_results.html CHANGED
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
- <dc:date>2025-10-28T14:09:21.825978</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
@@ -3956,70 +3956,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3956
  <g id="matplotlib.axis_2">
3957
  <g id="ytick_1">
3958
  <g id="grid-y--2" class="grid grid-y">
3959
- <path d="M 47.72 409.029804 L 840.20233 409.029804 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3960
  </g>
3961
  <g id="line2d_5">
3962
  <defs>
3963
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3964
  </defs>
3965
  <g>
3966
- <use ns4:href="#m0fca2865ba" x="47.72" y="409.029804" style="stroke: #000000; stroke-width: 0.8" />
3967
  </g>
3968
  </g>
3969
  <g id="text_5">
3970
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.829023" transform="rotate(-0 40.72 412.829023)">1.0</text>
3971
  </g>
3972
  </g>
3973
  <g id="ytick_2">
3974
  <g id="grid-y--3" class="grid grid-y">
3975
- <path d="M 47.72 331.290271 L 840.20233 331.290271 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3976
  </g>
3977
  <g id="line2d_6">
3978
  <g>
3979
- <use ns4:href="#m0fca2865ba" x="47.72" y="331.290271" style="stroke: #000000; stroke-width: 0.8" />
3980
  </g>
3981
  </g>
3982
  <g id="text_6">
3983
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="335.08949" transform="rotate(-0 40.72 335.08949)">1.5</text>
3984
  </g>
3985
  </g>
3986
  <g id="ytick_3">
3987
  <g id="grid-y--4" class="grid grid-y">
3988
- <path d="M 47.72 253.550738 L 840.20233 253.550738 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3989
  </g>
3990
  <g id="line2d_7">
3991
  <g>
3992
- <use ns4:href="#m0fca2865ba" x="47.72" y="253.550738" style="stroke: #000000; stroke-width: 0.8" />
3993
  </g>
3994
  </g>
3995
  <g id="text_7">
3996
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.349957" transform="rotate(-0 40.72 257.349957)">2.0</text>
3997
  </g>
3998
  </g>
3999
  <g id="ytick_4">
4000
  <g id="grid-y--5" class="grid grid-y">
4001
- <path d="M 47.72 175.811205 L 840.20233 175.811205 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4002
  </g>
4003
  <g id="line2d_8">
4004
  <g>
4005
- <use ns4:href="#m0fca2865ba" x="47.72" y="175.811205" style="stroke: #000000; stroke-width: 0.8" />
4006
  </g>
4007
  </g>
4008
  <g id="text_8">
4009
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.610424" transform="rotate(-0 40.72 179.610424)">2.5</text>
4010
  </g>
4011
  </g>
4012
  <g id="ytick_5">
4013
  <g id="grid-y--6" class="grid grid-y">
4014
- <path d="M 47.72 98.071672 L 840.20233 98.071672 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4015
  </g>
4016
  <g id="line2d_9">
4017
  <g>
4018
- <use ns4:href="#m0fca2865ba" x="47.72" y="98.071672" style="stroke: #000000; stroke-width: 0.8" />
4019
  </g>
4020
  </g>
4021
  <g id="text_9">
4022
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.870891" transform="rotate(-0 40.72 101.870891)">3.0</text>
4023
  </g>
4024
  </g>
4025
  <g id="label--y" class="ylabel">
@@ -4027,27 +4027,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4027
  </g>
4028
  </g>
4029
  <g id="series--torch-layer-norm" class="series">
4030
- <path d="M 83.741924 437.689571 L 323.888085 303.094453 L 564.034245 314.534914 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4031
  <defs>
4032
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4033
  </defs>
4034
  <g clip-path="url(#p2214f54723)">
4035
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4036
- <use ns4:href="#md7efaf3aec" x="323.888085" y="303.094453" style="fill: #1f77b4; stroke: #1f77b4" />
4037
- <use ns4:href="#md7efaf3aec" x="564.034245" y="314.534914" style="fill: #1f77b4; stroke: #1f77b4" />
4038
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4039
  </g>
4040
  </g>
4041
  <g id="series--hf-kernels-layer-norm" class="series">
4042
- <path d="M 83.741924 434.514533 L 323.888085 307.713737 L 564.034245 307.460461 L 804.180406 56.028111 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4043
  <defs>
4044
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4045
  </defs>
4046
  <g clip-path="url(#p2214f54723)">
4047
- <use ns4:href="#m9b8c54d372" x="83.741924" y="434.514533" style="fill: #ff7f0e; stroke: #ff7f0e" />
4048
- <use ns4:href="#m9b8c54d372" x="323.888085" y="307.713737" style="fill: #ff7f0e; stroke: #ff7f0e" />
4049
- <use ns4:href="#m9b8c54d372" x="564.034245" y="307.460461" style="fill: #ff7f0e; stroke: #ff7f0e" />
4050
- <use ns4:href="#m9b8c54d372" x="804.180406" y="56.028111" style="fill: #ff7f0e; stroke: #ff7f0e" />
4051
  </g>
4052
  </g>
4053
  <g id="patch_3">
@@ -4105,7 +4105,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4105
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4106
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4107
  </span> |
4108
- Cell: combine | 4.25s
4109
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4110
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4111
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4195,7 +4195,7 @@ impl wl p50(ms) ok
4195
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
4196
  hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4197
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4198
- hf_kernels_layer_norm LN_B16_S4096_D8192 3.27 True
4199
  torch_layer_norm LN_B16_S2048_D4096 0.82 True
4200
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
4201
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
@@ -4219,7 +4219,7 @@ Implementations included:
4219
  <div class="uv-install-logs" id="uv-logs-combine">
4220
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4221
  <div class="uv-logs-content" style="display: none;">
4222
- Installed 37 packages in 219ms
4223
  </div>
4224
  </div>
4225
  <div class="cell-artifacts">
@@ -4232,7 +4232,7 @@ Installed 37 packages in 219ms
4232
  <rdf:RDF>
4233
  <ns2:Work>
4234
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4235
- <dc:date>2025-10-28T14:09:21.825978</dc:date>
4236
  <dc:format>image/svg+xml</dc:format>
4237
  <dc:creator>
4238
  <ns2:Agent>
@@ -4316,70 +4316,70 @@ Installed 37 packages in 219ms
4316
  <g id="matplotlib.axis_2">
4317
  <g id="ytick_1">
4318
  <g id="grid-y--2" class="grid grid-y">
4319
- <path d="M 47.72 409.029804 L 840.20233 409.029804 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4320
  </g>
4321
  <g id="line2d_5">
4322
  <defs>
4323
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4324
  </defs>
4325
  <g>
4326
- <use ns4:href="#m0fca2865ba" x="47.72" y="409.029804" style="stroke: #000000; stroke-width: 0.8" />
4327
  </g>
4328
  </g>
4329
  <g id="text_5">
4330
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.829023" transform="rotate(-0 40.72 412.829023)">1.0</text>
4331
  </g>
4332
  </g>
4333
  <g id="ytick_2">
4334
  <g id="grid-y--3" class="grid grid-y">
4335
- <path d="M 47.72 331.290271 L 840.20233 331.290271 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4336
  </g>
4337
  <g id="line2d_6">
4338
  <g>
4339
- <use ns4:href="#m0fca2865ba" x="47.72" y="331.290271" style="stroke: #000000; stroke-width: 0.8" />
4340
  </g>
4341
  </g>
4342
  <g id="text_6">
4343
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="335.08949" transform="rotate(-0 40.72 335.08949)">1.5</text>
4344
  </g>
4345
  </g>
4346
  <g id="ytick_3">
4347
  <g id="grid-y--4" class="grid grid-y">
4348
- <path d="M 47.72 253.550738 L 840.20233 253.550738 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4349
  </g>
4350
  <g id="line2d_7">
4351
  <g>
4352
- <use ns4:href="#m0fca2865ba" x="47.72" y="253.550738" style="stroke: #000000; stroke-width: 0.8" />
4353
  </g>
4354
  </g>
4355
  <g id="text_7">
4356
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.349957" transform="rotate(-0 40.72 257.349957)">2.0</text>
4357
  </g>
4358
  </g>
4359
  <g id="ytick_4">
4360
  <g id="grid-y--5" class="grid grid-y">
4361
- <path d="M 47.72 175.811205 L 840.20233 175.811205 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4362
  </g>
4363
  <g id="line2d_8">
4364
  <g>
4365
- <use ns4:href="#m0fca2865ba" x="47.72" y="175.811205" style="stroke: #000000; stroke-width: 0.8" />
4366
  </g>
4367
  </g>
4368
  <g id="text_8">
4369
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.610424" transform="rotate(-0 40.72 179.610424)">2.5</text>
4370
  </g>
4371
  </g>
4372
  <g id="ytick_5">
4373
  <g id="grid-y--6" class="grid grid-y">
4374
- <path d="M 47.72 98.071672 L 840.20233 98.071672 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4375
  </g>
4376
  <g id="line2d_9">
4377
  <g>
4378
- <use ns4:href="#m0fca2865ba" x="47.72" y="98.071672" style="stroke: #000000; stroke-width: 0.8" />
4379
  </g>
4380
  </g>
4381
  <g id="text_9">
4382
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.870891" transform="rotate(-0 40.72 101.870891)">3.0</text>
4383
  </g>
4384
  </g>
4385
  <g id="label--y" class="ylabel">
@@ -4387,27 +4387,27 @@ Installed 37 packages in 219ms
4387
  </g>
4388
  </g>
4389
  <g id="series--torch-layer-norm" class="series">
4390
- <path d="M 83.741924 437.689571 L 323.888085 303.094453 L 564.034245 314.534914 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4391
  <defs>
4392
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4393
  </defs>
4394
  <g clip-path="url(#p2214f54723)">
4395
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4396
- <use ns4:href="#md7efaf3aec" x="323.888085" y="303.094453" style="fill: #1f77b4; stroke: #1f77b4" />
4397
- <use ns4:href="#md7efaf3aec" x="564.034245" y="314.534914" style="fill: #1f77b4; stroke: #1f77b4" />
4398
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4399
  </g>
4400
  </g>
4401
  <g id="series--hf-kernels-layer-norm" class="series">
4402
- <path d="M 83.741924 434.514533 L 323.888085 307.713737 L 564.034245 307.460461 L 804.180406 56.028111 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4403
  <defs>
4404
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4405
  </defs>
4406
  <g clip-path="url(#p2214f54723)">
4407
- <use ns4:href="#m9b8c54d372" x="83.741924" y="434.514533" style="fill: #ff7f0e; stroke: #ff7f0e" />
4408
- <use ns4:href="#m9b8c54d372" x="323.888085" y="307.713737" style="fill: #ff7f0e; stroke: #ff7f0e" />
4409
- <use ns4:href="#m9b8c54d372" x="564.034245" y="307.460461" style="fill: #ff7f0e; stroke: #ff7f0e" />
4410
- <use ns4:href="#m9b8c54d372" x="804.180406" y="56.028111" style="fill: #ff7f0e; stroke: #ff7f0e" />
4411
  </g>
4412
  </g>
4413
  <g id="patch_3">
 
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
+ <dc:date>2025-10-29T14:27:45.722521</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
 
3956
  <g id="matplotlib.axis_2">
3957
  <g id="ytick_1">
3958
  <g id="grid-y--2" class="grid grid-y">
3959
+ <path d="M 47.72 408.957392 L 840.20233 408.957392 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3960
  </g>
3961
  <g id="line2d_5">
3962
  <defs>
3963
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3964
  </defs>
3965
  <g>
3966
+ <use ns4:href="#m0fca2865ba" x="47.72" y="408.957392" style="stroke: #000000; stroke-width: 0.8" />
3967
  </g>
3968
  </g>
3969
  <g id="text_5">
3970
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.756611" transform="rotate(-0 40.72 412.756611)">1.0</text>
3971
  </g>
3972
  </g>
3973
  <g id="ytick_2">
3974
  <g id="grid-y--3" class="grid grid-y">
3975
+ <path d="M 47.72 331.05018 L 840.20233 331.05018 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3976
  </g>
3977
  <g id="line2d_6">
3978
  <g>
3979
+ <use ns4:href="#m0fca2865ba" x="47.72" y="331.05018" style="stroke: #000000; stroke-width: 0.8" />
3980
  </g>
3981
  </g>
3982
  <g id="text_6">
3983
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.849399" transform="rotate(-0 40.72 334.849399)">1.5</text>
3984
  </g>
3985
  </g>
3986
  <g id="ytick_3">
3987
  <g id="grid-y--4" class="grid grid-y">
3988
+ <path d="M 47.72 253.142969 L 840.20233 253.142969 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3989
  </g>
3990
  <g id="line2d_7">
3991
  <g>
3992
+ <use ns4:href="#m0fca2865ba" x="47.72" y="253.142969" style="stroke: #000000; stroke-width: 0.8" />
3993
  </g>
3994
  </g>
3995
  <g id="text_7">
3996
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.942188" transform="rotate(-0 40.72 256.942188)">2.0</text>
3997
  </g>
3998
  </g>
3999
  <g id="ytick_4">
4000
  <g id="grid-y--5" class="grid grid-y">
4001
+ <path d="M 47.72 175.235758 L 840.20233 175.235758 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4002
  </g>
4003
  <g id="line2d_8">
4004
  <g>
4005
+ <use ns4:href="#m0fca2865ba" x="47.72" y="175.235758" style="stroke: #000000; stroke-width: 0.8" />
4006
  </g>
4007
  </g>
4008
  <g id="text_8">
4009
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.034976" transform="rotate(-0 40.72 179.034976)">2.5</text>
4010
  </g>
4011
  </g>
4012
  <g id="ytick_5">
4013
  <g id="grid-y--6" class="grid grid-y">
4014
+ <path d="M 47.72 97.328546 L 840.20233 97.328546 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4015
  </g>
4016
  <g id="line2d_9">
4017
  <g>
4018
+ <use ns4:href="#m0fca2865ba" x="47.72" y="97.328546" style="stroke: #000000; stroke-width: 0.8" />
4019
  </g>
4020
  </g>
4021
  <g id="text_9">
4022
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.127765" transform="rotate(-0 40.72 101.127765)">3.0</text>
4023
  </g>
4024
  </g>
4025
  <g id="label--y" class="ylabel">
 
4027
  </g>
4028
  </g>
4029
  <g id="series--torch-layer-norm" class="series">
4030
+ <path d="M 83.741924 437.689571 L 323.888085 302.833591 L 564.034245 313.993176 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4031
  <defs>
4032
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4033
  </defs>
4034
  <g clip-path="url(#p2214f54723)">
4035
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4036
+ <use ns4:href="#md7efaf3aec" x="323.888085" y="302.833591" style="fill: #1f77b4; stroke: #1f77b4" />
4037
+ <use ns4:href="#md7efaf3aec" x="564.034245" y="313.993176" style="fill: #1f77b4; stroke: #1f77b4" />
4038
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4039
  </g>
4040
  </g>
4041
  <g id="series--hf-kernels-layer-norm" class="series">
4042
+ <path d="M 83.741924 434.434608 L 323.888085 307.690482 L 564.034245 307.408302 L 804.180406 57.182805 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4043
  <defs>
4044
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4045
  </defs>
4046
  <g clip-path="url(#p2214f54723)">
4047
+ <use ns4:href="#m9b8c54d372" x="83.741924" y="434.434608" style="fill: #ff7f0e; stroke: #ff7f0e" />
4048
+ <use ns4:href="#m9b8c54d372" x="323.888085" y="307.690482" style="fill: #ff7f0e; stroke: #ff7f0e" />
4049
+ <use ns4:href="#m9b8c54d372" x="564.034245" y="307.408302" style="fill: #ff7f0e; stroke: #ff7f0e" />
4050
+ <use ns4:href="#m9b8c54d372" x="804.180406" y="57.182805" style="fill: #ff7f0e; stroke: #ff7f0e" />
4051
  </g>
4052
  </g>
4053
  <g id="patch_3">
 
4105
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4106
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4107
  </span> |
4108
+ Cell: combine | 4.21s
4109
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4110
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4111
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4195
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
4196
  hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4197
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4198
+ hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
4199
  torch_layer_norm LN_B16_S2048_D4096 0.82 True
4200
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
4201
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
 
4219
  <div class="uv-install-logs" id="uv-logs-combine">
4220
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4221
  <div class="uv-logs-content" style="display: none;">
4222
+ Installed 37 packages in 210ms
4223
  </div>
4224
  </div>
4225
  <div class="cell-artifacts">
 
4232
  <rdf:RDF>
4233
  <ns2:Work>
4234
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4235
+ <dc:date>2025-10-29T14:27:45.722521</dc:date>
4236
  <dc:format>image/svg+xml</dc:format>
4237
  <dc:creator>
4238
  <ns2:Agent>
 
4316
  <g id="matplotlib.axis_2">
4317
  <g id="ytick_1">
4318
  <g id="grid-y--2" class="grid grid-y">
4319
+ <path d="M 47.72 408.957392 L 840.20233 408.957392 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4320
  </g>
4321
  <g id="line2d_5">
4322
  <defs>
4323
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4324
  </defs>
4325
  <g>
4326
+ <use ns4:href="#m0fca2865ba" x="47.72" y="408.957392" style="stroke: #000000; stroke-width: 0.8" />
4327
  </g>
4328
  </g>
4329
  <g id="text_5">
4330
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.756611" transform="rotate(-0 40.72 412.756611)">1.0</text>
4331
  </g>
4332
  </g>
4333
  <g id="ytick_2">
4334
  <g id="grid-y--3" class="grid grid-y">
4335
+ <path d="M 47.72 331.05018 L 840.20233 331.05018 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4336
  </g>
4337
  <g id="line2d_6">
4338
  <g>
4339
+ <use ns4:href="#m0fca2865ba" x="47.72" y="331.05018" style="stroke: #000000; stroke-width: 0.8" />
4340
  </g>
4341
  </g>
4342
  <g id="text_6">
4343
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.849399" transform="rotate(-0 40.72 334.849399)">1.5</text>
4344
  </g>
4345
  </g>
4346
  <g id="ytick_3">
4347
  <g id="grid-y--4" class="grid grid-y">
4348
+ <path d="M 47.72 253.142969 L 840.20233 253.142969 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4349
  </g>
4350
  <g id="line2d_7">
4351
  <g>
4352
+ <use ns4:href="#m0fca2865ba" x="47.72" y="253.142969" style="stroke: #000000; stroke-width: 0.8" />
4353
  </g>
4354
  </g>
4355
  <g id="text_7">
4356
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.942188" transform="rotate(-0 40.72 256.942188)">2.0</text>
4357
  </g>
4358
  </g>
4359
  <g id="ytick_4">
4360
  <g id="grid-y--5" class="grid grid-y">
4361
+ <path d="M 47.72 175.235758 L 840.20233 175.235758 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4362
  </g>
4363
  <g id="line2d_8">
4364
  <g>
4365
+ <use ns4:href="#m0fca2865ba" x="47.72" y="175.235758" style="stroke: #000000; stroke-width: 0.8" />
4366
  </g>
4367
  </g>
4368
  <g id="text_8">
4369
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.034976" transform="rotate(-0 40.72 179.034976)">2.5</text>
4370
  </g>
4371
  </g>
4372
  <g id="ytick_5">
4373
  <g id="grid-y--6" class="grid grid-y">
4374
+ <path d="M 47.72 97.328546 L 840.20233 97.328546 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4375
  </g>
4376
  <g id="line2d_9">
4377
  <g>
4378
+ <use ns4:href="#m0fca2865ba" x="47.72" y="97.328546" style="stroke: #000000; stroke-width: 0.8" />
4379
  </g>
4380
  </g>
4381
  <g id="text_9">
4382
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.127765" transform="rotate(-0 40.72 101.127765)">3.0</text>
4383
  </g>
4384
  </g>
4385
  <g id="label--y" class="ylabel">
 
4387
  </g>
4388
  </g>
4389
  <g id="series--torch-layer-norm" class="series">
4390
+ <path d="M 83.741924 437.689571 L 323.888085 302.833591 L 564.034245 313.993176 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4391
  <defs>
4392
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4393
  </defs>
4394
  <g clip-path="url(#p2214f54723)">
4395
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4396
+ <use ns4:href="#md7efaf3aec" x="323.888085" y="302.833591" style="fill: #1f77b4; stroke: #1f77b4" />
4397
+ <use ns4:href="#md7efaf3aec" x="564.034245" y="313.993176" style="fill: #1f77b4; stroke: #1f77b4" />
4398
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4399
  </g>
4400
  </g>
4401
  <g id="series--hf-kernels-layer-norm" class="series">
4402
+ <path d="M 83.741924 434.434608 L 323.888085 307.690482 L 564.034245 307.408302 L 804.180406 57.182805 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4403
  <defs>
4404
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4405
  </defs>
4406
  <g clip-path="url(#p2214f54723)">
4407
+ <use ns4:href="#m9b8c54d372" x="83.741924" y="434.434608" style="fill: #ff7f0e; stroke: #ff7f0e" />
4408
+ <use ns4:href="#m9b8c54d372" x="323.888085" y="307.690482" style="fill: #ff7f0e; stroke: #ff7f0e" />
4409
+ <use ns4:href="#m9b8c54d372" x="564.034245" y="307.408302" style="fill: #ff7f0e; stroke: #ff7f0e" />
4410
+ <use ns4:href="#m9b8c54d372" x="804.180406" y="57.182805" style="fill: #ff7f0e; stroke: #ff7f0e" />
4411
  </g>
4412
  </g>
4413
  <g id="patch_3">
rotary/impls/artifacts/benchmark/rotary.jsonl CHANGED
@@ -1,24 +1,24 @@
1
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.1724160000549091, "p50": 0.17308600001797458, "p90": 0.1756759999125279, "mean": 0.1760500000045795, "iqr": 0.0032199998258874984, "raw_times": [0.17245600008664042, 0.1756759999125279, 0.1724160000549091, 0.17308600001797458, 0.18661599995084543], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.17975699995531613, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
2
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22642799990535423, "p50": 0.2294280000114668, "p90": 0.23093799995876907, "mean": 0.23135619996992318, "iqr": 0.0026599999500831473, "raw_times": [0.23093799995876907, 0.22642799990535423, 0.22827800000868592, 0.2417089999653399, 0.2294280000114668], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23494799995660287, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
3
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21688800006813835, "p50": 0.21992799997860857, "p90": 0.2219079999576934, "mean": 0.22172000001319248, "iqr": 0.004439999884198187, "raw_times": [0.2174680000734952, 0.2219079999576934, 0.21688800006813835, 0.23240799998802686, 0.21992799997860857], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.225418000013633, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
4
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21487700007583044, "p50": 0.21964699999443837, "p90": 0.22132800006602338, "mean": 0.21978760003094067, "iqr": 0.005100000066704524, "raw_times": [0.21487700007583044, 0.21622799999931885, 0.21964699999443837, 0.22132800006602338, 0.2268580000190923], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.24882799993974913, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
5
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21332699998311, "p50": 0.21615699995436444, "p90": 0.21744800005762954, "mean": 0.21590960000139603, "iqr": 0.0025000000505315256, "raw_times": [0.21332699998311, 0.21744800005762954, 0.21494800000709802, 0.21766800000477815, 0.21615699995436444], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22001800005000405, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
6
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21038799991401902, "p50": 0.21561700009442575, "p90": 0.21720800009461527, "mean": 0.22098599999935686, "iqr": 0.004100000182916119, "raw_times": [0.21038799991401902, 0.21720800009461527, 0.21561700009442575, 0.24860899998202513, 0.21310799991169915], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2178580000418151, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
7
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21404700009952649, "p50": 0.21557699994900759, "p90": 0.2158679999411106, "mean": 0.2152116000161186, "iqr": 0.0011999999287581886, "raw_times": [0.2158679999411106, 0.21589800007859594, 0.21404700009952649, 0.21466800001235242, 0.21557699994900759], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21567799990407366, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
8
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21701799994389148, "p50": 0.21822700000484474, "p90": 0.22002800005793688, "mean": 0.2237478000097326, "iqr": 0.002031000008173578, "raw_times": [0.22002800005793688, 0.2179970000497633, 0.2454689999922266, 0.21822700000484474, 0.21701799994389148], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22291799996310147, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
9
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21712800003115262, "p50": 0.21885700004986575, "p90": 0.2196080000658185, "mean": 0.22401780001928273, "iqr": 0.001630000042496249, "raw_times": [0.21797800002332224, 0.2196080000658185, 0.24651799992625456, 0.21885700004986575, 0.21712800003115262], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2206780000051367, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
10
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21462800009430794, "p50": 0.21782799990432977, "p90": 0.21795700001803198, "mean": 0.21911359999648994, "iqr": 0.0030300000162242213, "raw_times": [0.21462800009430794, 0.23022799996397225, 0.21782799990432977, 0.21492700000180776, 0.21795700001803198], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2186980000260519, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
11
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21579799999926763, "p50": 0.21701699995446688, "p90": 0.22130799993647088, "mean": 0.2237457999626713, "iqr": 0.004450000005817856, "raw_times": [0.21701699995446688, 0.22130799993647088, 0.21579799999926763, 0.24774799999249808, 0.21685799993065302], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22235700009787251, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
12
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22434800007431477, "p50": 0.2248280000003433, "p90": 0.22490799995011912, "mean": 0.22479799997654482, "iqr": 0.00031000001854408765, "raw_times": [0.2248280000003433, 0.22490799995011912, 0.22459799993157503, 0.22434800007431477, 0.22530799992637185], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23522799995134847, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
13
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21574699997017888, "p50": 0.21802799994929956, "p90": 0.21904799996264046, "mean": 0.22033179998288688, "iqr": 0.0018999999156221747, "raw_times": [0.21714800004701829, 0.21802799994929956, 0.2316879999852972, 0.21904799996264046, 0.21574699997017888], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22168800001054478, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
14
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21753800001533818, "p50": 0.21888800006308884, "p90": 0.22129700005280029, "mean": 0.22190180004599824, "iqr": 0.003358999947522534, "raw_times": [0.21753800001533818, 0.23384799999348616, 0.21793800010527775, 0.21888800006308884, 0.22129700005280029], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22266799999215436, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
15
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2166670000178783, "p50": 0.21850699999959033, "p90": 0.21964699999443837, "mean": 0.21864339998955984, "iqr": 0.001419000000169035, "raw_times": [0.21850699999959033, 0.2166670000178783, 0.22016799994162284, 0.21822799999426934, 0.21964699999443837], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23552799996195972, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
16
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21624800001518452, "p50": 0.21773700007088337, "p90": 0.21802799994929956, "mean": 0.21774760000425886, "iqr": 0.0013409999155555852, "raw_times": [0.21668700003374397, 0.21773700007088337, 0.22003799995218287, 0.21624800001518452, 0.21802799994929956], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2226780000000872, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
17
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21748699998624943, "p50": 0.22014700005001941, "p90": 0.22206799997093185, "mean": 0.22232159999475698, "iqr": 0.0019999999949504854, "raw_times": [0.22014700005001941, 0.23183799999060284, 0.22206799997093185, 0.21748699998624943, 0.22006799997598137], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22040800001832395, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
18
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21601800006010308, "p50": 0.21957800004202, "p90": 0.22023799999715266, "mean": 0.2213318000030995, "iqr": 0.0024510000002919696, "raw_times": [0.23303799991936103, 0.21601800006010308, 0.2177869999968607, 0.21957800004202, 0.22023799999715266], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.220787999978711, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
19
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21692799998618284, "p50": 0.22003699996275827, "p90": 0.2230679999684071, "mean": 0.222287800011145, "iqr": 0.0031599998919773498, "raw_times": [0.21692799998618284, 0.21990800007642974, 0.2314980000619471, 0.2230679999684071, 0.22003699996275827], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22102700006598752, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
20
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2160679999860804, "p50": 0.21972700005790102, "p90": 0.22029800004474964, "mean": 0.21970960001453932, "iqr": 0.0024610000082248007, "raw_times": [0.2160679999860804, 0.2246179999474407, 0.22029800004474964, 0.21972700005790102, 0.21783700003652484], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22191799996562622, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
21
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2172279999967941, "p50": 0.21847799996521644, "p90": 0.22105800007921061, "mean": 0.22193580000475777, "iqr": 0.0035110000453641987, "raw_times": [0.21847799996521644, 0.22105800007921061, 0.23536799994872126, 0.21754700003384642, 0.2172279999967941], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22206799997093185, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
22
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21436800000174117, "p50": 0.21785799992812827, "p90": 0.2195579999124675, "mean": 0.2202379999744153, "iqr": 0.0030299999025373836, "raw_times": [0.21436800000174117, 0.21785799992812827, 0.2195579999124675, 0.2165280000099301, 0.2328780000198094], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25353900002755836, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
23
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22968799999034673, "p50": 0.23015800002212927, "p90": 0.23064800006977748, "mean": 0.23369620002995362, "iqr": 0.0006600000688194996, "raw_times": [0.23015800002212927, 0.24799900006655662, 0.22968799999034673, 0.22998800000095798, 0.23064800006977748], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23042800000894204, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
24
- {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.6347319999804313, "p50": 0.6375930000785957, "p90": 0.639283000055002, "mean": 0.6376124000325945, "iqr": 0.003270999968663091, "raw_times": [0.6375930000785957, 0.636012000086339, 0.6404419999626043, 0.6347319999804313, 0.639283000055002], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.639422999938688, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
 
1
+ {"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17006399997399058, "p50": 0.17533400000502297, "p90": 0.1853339999797754, "mean": 0.1802961999942454, "iqr": 0.014799999974002276, "raw_times": [0.17533400000502297, 0.20021500000666492, 0.17053400000577312, 0.1853339999797754, 0.17006399997399058], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18331500001522727, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
2
+ {"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2246159999685915, "p50": 0.2266160000203854, "p90": 0.22888500001272405, "mean": 0.22735560000910482, "iqr": 0.002880000010918593, "raw_times": [0.22600500000180546, 0.22888500001272405, 0.2246159999685915, 0.2266160000203854, 0.2306560000420177], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2471160000254713, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
3
+ {"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21813499995460006, "p50": 0.22189599997091136, "p90": 0.2272149999953399, "mean": 0.22315939997952228, "iqr": 0.007960000004914036, "raw_times": [0.2272149999953399, 0.22189599997091136, 0.21925499999042586, 0.21813499995460006, 0.2292959999863342], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2391049999914685, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
4
+ {"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21478600001501036, "p50": 0.21544499998071842, "p90": 0.2178249999928994, "mean": 0.2161891999890031, "iqr": 0.0027799999884337012, "raw_times": [0.21544499998071842, 0.2178249999928994, 0.21478600001501036, 0.21784499995192164, 0.2150450000044657], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22814599998355334, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
5
+ {"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2160950000416051, "p50": 0.22390499998437008, "p90": 0.22473600000694205, "mean": 0.22559540000202105, "iqr": 0.00507100003233063, "raw_times": [0.22390499998437008, 0.24357600000257662, 0.22473600000694205, 0.2160950000416051, 0.21966499997461142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.229085999990275, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
6
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21366499998975996, "p50": 0.21597500000325454, "p90": 0.21670500001391702, "mean": 0.2158129999884295, "iqr": 0.0008600000569458643, "raw_times": [0.21366499998975996, 0.2168749999782449, 0.21597500000325454, 0.21670500001391702, 0.21584499995697115], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21572499997546402, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
7
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2142449999951168, "p50": 0.21574499999132968, "p90": 0.2169850000086626, "mean": 0.21585539999477987, "iqr": 0.0022990000161371427, "raw_times": [0.2142449999951168, 0.2176159999862648, 0.21468599999252547, 0.2169850000086626, 0.21574499999132968], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2192349999745602, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
8
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21675499999673775, "p50": 0.21711599998752718, "p90": 0.21833499999956985, "mean": 0.2174776000060774, "iqr": 0.0015789999565640755, "raw_times": [0.21675499999673775, 0.21711599998752718, 0.21833499999956985, 0.21675600004300577, 0.2184260000035465], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22064600000248902, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
9
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2153649999740992, "p50": 0.21702599997297511, "p90": 0.21829499996783852, "mean": 0.21729759998834197, "iqr": 0.0014989999499448459, "raw_times": [0.2153649999740992, 0.21679600001789368, 0.21900600000890336, 0.21702599997297511, 0.21829499996783852], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22154499998805477, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
10
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2143060000321384, "p50": 0.2161449999675824, "p90": 0.21640500000330576, "mean": 0.21578740000904872, "iqr": 0.0008589999538344273, "raw_times": [0.21653499999274572, 0.21640500000330576, 0.2143060000321384, 0.2161449999675824, 0.21554600004947133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23455599995259035, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
11
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21488499999122723, "p50": 0.21633500000461936, "p90": 0.21918499999173946, "mean": 0.21730919999072285, "iqr": 0.004300000000512227, "raw_times": [0.21488499999122723, 0.21918499999173946, 0.21488499999122723, 0.22125599997480094, 0.21633500000461936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2185359999771208, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
12
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2230359999657594, "p50": 0.22526600002947816, "p90": 0.22695600000588456, "mean": 0.22723160000168718, "iqr": 0.0026509999884183344, "raw_times": [0.22526600002947816, 0.2230359999657594, 0.23659499998984757, 0.22430500001746623, 0.22695600000588456], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22456599998577076, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
13
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21608499997682884, "p50": 0.2175149999743553, "p90": 0.22948600002337116, "mean": 0.2247094000040306, "iqr": 0.012610999988282856, "raw_times": [0.21608499997682884, 0.2168750000350883, 0.22948600002337116, 0.24358600001050945, 0.2175149999743553], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21851499997183055, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
14
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2162149999662688, "p50": 0.21694499997693129, "p90": 0.2171250000060354, "mean": 0.21706100000074002, "iqr": 0.0003099999617006688, "raw_times": [0.2162149999662688, 0.21694499997693129, 0.2171250000060354, 0.21820500001012988, 0.21681500004433474], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21809500003655558, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
15
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2136749999976928, "p50": 0.21658500003240988, "p90": 0.21662599999672238, "mean": 0.21621120000645533, "iqr": 0.00066100000140068, "raw_times": [0.2136749999976928, 0.2159649999953217, 0.21658500003240988, 0.21662599999672238, 0.21820500001012988], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2181750000431748, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
16
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21695499998486412, "p50": 0.21774499998628016, "p90": 0.2285450000272249, "mean": 0.22256720000086716, "iqr": 0.010920000022451859, "raw_times": [0.21774499998628016, 0.21762500000477303, 0.2319660000011936, 0.21695499998486412, 0.2285450000272249], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22269599998026024, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
17
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21440599999777987, "p50": 0.21785499995985447, "p90": 0.2335159999802272, "mean": 0.2228595999895333, "iqr": 0.01891099998374557, "raw_times": [0.21440599999777987, 0.23391600001332336, 0.21785499995985447, 0.2335159999802272, 0.21460499999648164], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21932499998911226, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
18
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2133250000042608, "p50": 0.21422499997925115, "p90": 0.21653499999274572, "mean": 0.21708740000576654, "iqr": 0.0029589999712698045, "raw_times": [0.21357600002147592, 0.2133250000042608, 0.21653499999274572, 0.21422499997925115, 0.2277760000310991], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22739600001386862, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
19
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21597500000325454, "p50": 0.2176859999849512, "p90": 0.21771499996248167, "mean": 0.21758339998996234, "iqr": 0.0013999999737279722, "raw_times": [0.2176859999849512, 0.21771499996248167, 0.22022600001037063, 0.2163149999887537, 0.21597500000325454], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21613599994907418, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
20
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21577600000455277, "p50": 0.2173749999769825, "p90": 0.21900600000890336, "mean": 0.21836960000882755, "iqr": 0.0018509999790694565, "raw_times": [0.2171550000298339, 0.21577600000455277, 0.2225360000238652, 0.2173749999769825, 0.21900600000890336], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22321599999486352, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
21
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21538499998996485, "p50": 0.21647599999141676, "p90": 0.21717500004569956, "mean": 0.2167214000110107, "iqr": 0.001030000021273736, "raw_times": [0.21717500004569956, 0.2184260000035465, 0.21538499998996485, 0.21614500002442583, 0.21647599999141676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21872600001415776, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
22
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21501500003751062, "p50": 0.2168760000245129, "p90": 0.2187050000088675, "mean": 0.21949320001795058, "iqr": 0.0030500000320898835, "raw_times": [0.21565499997677762, 0.2187050000088675, 0.21501500003751062, 0.23121500004208428, 0.2168760000245129], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22076499999457155, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
23
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22937599999295344, "p50": 0.23008499999832566, "p90": 0.23144499999716572, "mean": 0.23359140000138723, "iqr": 0.0020100000028833165, "raw_times": [0.23144499999716572, 0.23008499999832566, 0.2294349999942824, 0.22937599999295344, 0.24761600002420892], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23195599999326078, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
24
+ {"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.637245999996594, "p50": 0.6388759999822469, "p90": 0.6389449999915087, "mean": 0.6396317999929124, "iqr": 0.0012190000120426703, "raw_times": [0.6388759999822469, 0.6453660000147465, 0.6389449999915087, 0.637245999996594, 0.637725999979466], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.6364359999793123, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
rotary/impls/hf_kernels_rotary.html CHANGED
The diff for this file is too large to render. See raw diff
 
rotary/impls/torch_rotary.html CHANGED
The diff for this file is too large to render. See raw diff
 
rotary/index.html CHANGED
The diff for this file is too large to render. See raw diff
 
rotary/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 0517a426384d0bc9df1932ace04595ea1867cb036e7fbeced61eb044cff2e335
  • Pointer size: 130 Bytes
  • Size of remote file: 31 kB

Git LFS Details

  • SHA256: 36e71e631ab1a00097df3bc72a4532b4b383ed31a1df2368bd041e765254a9c3
  • Pointer size: 130 Bytes
  • Size of remote file: 31 kB
rotary/results/combined_results.html CHANGED
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
- <dc:date>2025-10-28T14:09:08.848427</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
@@ -4216,70 +4216,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4216
  <g id="matplotlib.axis_2">
4217
  <g id="ytick_1">
4218
  <g id="grid-y--2" class="grid grid-y">
4219
- <path d="M 47.72 384.19215 L 823.142937 384.19215 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4220
  </g>
4221
  <g id="line2d_25">
4222
  <defs>
4223
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4224
  </defs>
4225
  <g>
4226
- <use ns4:href="#m0fca2865ba" x="47.72" y="384.19215" style="stroke: #000000; stroke-width: 0.8" />
4227
  </g>
4228
  </g>
4229
  <g id="text_25">
4230
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="387.991369" transform="rotate(-0 40.72 387.991369)">0.2</text>
4231
  </g>
4232
  </g>
4233
  <g id="ytick_2">
4234
  <g id="grid-y--3" class="grid grid-y">
4235
- <path d="M 47.72 306.653539 L 823.142937 306.653539 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4236
  </g>
4237
  <g id="line2d_26">
4238
  <g>
4239
- <use ns4:href="#m0fca2865ba" x="47.72" y="306.653539" style="stroke: #000000; stroke-width: 0.8" />
4240
  </g>
4241
  </g>
4242
  <g id="text_26">
4243
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="310.452758" transform="rotate(-0 40.72 310.452758)">0.3</text>
4244
  </g>
4245
  </g>
4246
  <g id="ytick_3">
4247
  <g id="grid-y--4" class="grid grid-y">
4248
- <path d="M 47.72 229.114927 L 823.142937 229.114927 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4249
  </g>
4250
  <g id="line2d_27">
4251
  <g>
4252
- <use ns4:href="#m0fca2865ba" x="47.72" y="229.114927" style="stroke: #000000; stroke-width: 0.8" />
4253
  </g>
4254
  </g>
4255
  <g id="text_27">
4256
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="232.914146" transform="rotate(-0 40.72 232.914146)">0.4</text>
4257
  </g>
4258
  </g>
4259
  <g id="ytick_4">
4260
  <g id="grid-y--5" class="grid grid-y">
4261
- <path d="M 47.72 151.576316 L 823.142937 151.576316 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4262
  </g>
4263
  <g id="line2d_28">
4264
  <g>
4265
- <use ns4:href="#m0fca2865ba" x="47.72" y="151.576316" style="stroke: #000000; stroke-width: 0.8" />
4266
  </g>
4267
  </g>
4268
  <g id="text_28">
4269
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="155.375535" transform="rotate(-0 40.72 155.375535)">0.5</text>
4270
  </g>
4271
  </g>
4272
  <g id="ytick_5">
4273
  <g id="grid-y--6" class="grid grid-y">
4274
- <path d="M 47.72 74.037704 L 823.142937 74.037704 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4275
  </g>
4276
  <g id="line2d_29">
4277
  <g>
4278
- <use ns4:href="#m0fca2865ba" x="47.72" y="74.037704" style="stroke: #000000; stroke-width: 0.8" />
4279
  </g>
4280
  </g>
4281
  <g id="text_29">
4282
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="77.836923" transform="rotate(-0 40.72 77.836923)">0.6</text>
4283
  </g>
4284
  </g>
4285
  <g id="label--y" class="ylabel">
@@ -4287,34 +4287,34 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4287
  </g>
4288
  </g>
4289
  <g id="series--torch-eager" class="series">
4290
- <path d="M 82.966497 405.060892 L 113.615625 361.374088 L 144.264753 368.740256 L 174.913881 368.958139 L 205.563009 371.664237 L 236.212137 372.082945 L 266.861265 372.113961 L 297.510393 370.059188 L 328.159521 369.570694 L 358.808648 370.368567 L 389.457776 370.997405 L 420.106904 364.940864 L 450.756032 370.21349 L 481.40516 369.546657 L 512.054288 369.84208 L 542.703416 370.439127 L 573.352544 368.570446 L 604.001672 369.011641 L 634.6508 368.655739 L 665.299928 368.896108 L 695.949056 369.864566 L 726.598184 370.345305 L 757.247312 360.808056 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4291
  <defs>
4292
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4293
  </defs>
4294
  <g clip-path="url(#p088c925177)">
4295
  <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
4296
- <use ns4:href="#md7efaf3aec" x="113.615625" y="361.374088" style="fill: #1f77b4; stroke: #1f77b4" />
4297
- <use ns4:href="#md7efaf3aec" x="144.264753" y="368.740256" style="fill: #1f77b4; stroke: #1f77b4" />
4298
- <use ns4:href="#md7efaf3aec" x="174.913881" y="368.958139" style="fill: #1f77b4; stroke: #1f77b4" />
4299
- <use ns4:href="#md7efaf3aec" x="205.563009" y="371.664237" style="fill: #1f77b4; stroke: #1f77b4" />
4300
- <use ns4:href="#md7efaf3aec" x="236.212137" y="372.082945" style="fill: #1f77b4; stroke: #1f77b4" />
4301
- <use ns4:href="#md7efaf3aec" x="266.861265" y="372.113961" style="fill: #1f77b4; stroke: #1f77b4" />
4302
- <use ns4:href="#md7efaf3aec" x="297.510393" y="370.059188" style="fill: #1f77b4; stroke: #1f77b4" />
4303
- <use ns4:href="#md7efaf3aec" x="328.159521" y="369.570694" style="fill: #1f77b4; stroke: #1f77b4" />
4304
- <use ns4:href="#md7efaf3aec" x="358.808648" y="370.368567" style="fill: #1f77b4; stroke: #1f77b4" />
4305
- <use ns4:href="#md7efaf3aec" x="389.457776" y="370.997405" style="fill: #1f77b4; stroke: #1f77b4" />
4306
- <use ns4:href="#md7efaf3aec" x="420.106904" y="364.940864" style="fill: #1f77b4; stroke: #1f77b4" />
4307
- <use ns4:href="#md7efaf3aec" x="450.756032" y="370.21349" style="fill: #1f77b4; stroke: #1f77b4" />
4308
- <use ns4:href="#md7efaf3aec" x="481.40516" y="369.546657" style="fill: #1f77b4; stroke: #1f77b4" />
4309
- <use ns4:href="#md7efaf3aec" x="512.054288" y="369.84208" style="fill: #1f77b4; stroke: #1f77b4" />
4310
- <use ns4:href="#md7efaf3aec" x="542.703416" y="370.439127" style="fill: #1f77b4; stroke: #1f77b4" />
4311
- <use ns4:href="#md7efaf3aec" x="573.352544" y="368.570446" style="fill: #1f77b4; stroke: #1f77b4" />
4312
- <use ns4:href="#md7efaf3aec" x="604.001672" y="369.011641" style="fill: #1f77b4; stroke: #1f77b4" />
4313
- <use ns4:href="#md7efaf3aec" x="634.6508" y="368.655739" style="fill: #1f77b4; stroke: #1f77b4" />
4314
- <use ns4:href="#md7efaf3aec" x="665.299928" y="368.896108" style="fill: #1f77b4; stroke: #1f77b4" />
4315
- <use ns4:href="#md7efaf3aec" x="695.949056" y="369.864566" style="fill: #1f77b4; stroke: #1f77b4" />
4316
- <use ns4:href="#md7efaf3aec" x="726.598184" y="370.345305" style="fill: #1f77b4; stroke: #1f77b4" />
4317
- <use ns4:href="#md7efaf3aec" x="757.247312" y="360.808056" style="fill: #1f77b4; stroke: #1f77b4" />
4318
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
4319
  </g>
4320
  </g>
@@ -4364,7 +4364,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4364
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4365
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4366
  </span> |
4367
- Cell: combine | 4.36s
4368
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4369
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4370
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4453,7 +4453,7 @@ COMBINED BENCHMARK SUMMARY
4453
  impl wl p50(ms) ok
4454
  hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 False
4455
  hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 False
4456
- hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 False
4457
  hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 False
4458
  hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.09 False
4459
  hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 False
@@ -4478,8 +4478,8 @@ hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 False
4478
  torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
4479
  torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
4480
  torch_eager cuda_B1_S128_H8_D128_R64 0.23 True
4481
- torch_eager cuda_B1_S128_H8_D64_R32 0.17 True
4482
- torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True
4483
  torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True
4484
  torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True
4485
  torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True
@@ -4497,7 +4497,7 @@ torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True
4497
  torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
4498
  torch_eager cuda_B2_S512_H32_D128_R64 0.22 True
4499
  torch_eager cuda_B2_S512_H32_D64_R32 0.22 True
4500
- torch_eager cuda_B2_S512_H8_D128_R64 0.22 True
4501
  torch_eager cuda_B2_S512_H8_D64_R32 0.22 True
4502
 
4503
  GENERATING COMBINED VISUALIZATION
@@ -4518,7 +4518,7 @@ Implementations included:
4518
  <div class="uv-install-logs" id="uv-logs-combine">
4519
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4520
  <div class="uv-logs-content" style="display: none;">
4521
- Installed 37 packages in 219ms
4522
  </div>
4523
  </div>
4524
  <div class="cell-artifacts">
@@ -4531,7 +4531,7 @@ Installed 37 packages in 219ms
4531
  <rdf:RDF>
4532
  <ns2:Work>
4533
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4534
- <dc:date>2025-10-28T14:09:08.848427</dc:date>
4535
  <dc:format>image/svg+xml</dc:format>
4536
  <dc:creator>
4537
  <ns2:Agent>
@@ -4875,70 +4875,70 @@ Installed 37 packages in 219ms
4875
  <g id="matplotlib.axis_2">
4876
  <g id="ytick_1">
4877
  <g id="grid-y--2" class="grid grid-y">
4878
- <path d="M 47.72 384.19215 L 823.142937 384.19215 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4879
  </g>
4880
  <g id="line2d_25">
4881
  <defs>
4882
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4883
  </defs>
4884
  <g>
4885
- <use ns4:href="#m0fca2865ba" x="47.72" y="384.19215" style="stroke: #000000; stroke-width: 0.8" />
4886
  </g>
4887
  </g>
4888
  <g id="text_25">
4889
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="387.991369" transform="rotate(-0 40.72 387.991369)">0.2</text>
4890
  </g>
4891
  </g>
4892
  <g id="ytick_2">
4893
  <g id="grid-y--3" class="grid grid-y">
4894
- <path d="M 47.72 306.653539 L 823.142937 306.653539 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4895
  </g>
4896
  <g id="line2d_26">
4897
  <g>
4898
- <use ns4:href="#m0fca2865ba" x="47.72" y="306.653539" style="stroke: #000000; stroke-width: 0.8" />
4899
  </g>
4900
  </g>
4901
  <g id="text_26">
4902
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="310.452758" transform="rotate(-0 40.72 310.452758)">0.3</text>
4903
  </g>
4904
  </g>
4905
  <g id="ytick_3">
4906
  <g id="grid-y--4" class="grid grid-y">
4907
- <path d="M 47.72 229.114927 L 823.142937 229.114927 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4908
  </g>
4909
  <g id="line2d_27">
4910
  <g>
4911
- <use ns4:href="#m0fca2865ba" x="47.72" y="229.114927" style="stroke: #000000; stroke-width: 0.8" />
4912
  </g>
4913
  </g>
4914
  <g id="text_27">
4915
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="232.914146" transform="rotate(-0 40.72 232.914146)">0.4</text>
4916
  </g>
4917
  </g>
4918
  <g id="ytick_4">
4919
  <g id="grid-y--5" class="grid grid-y">
4920
- <path d="M 47.72 151.576316 L 823.142937 151.576316 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4921
  </g>
4922
  <g id="line2d_28">
4923
  <g>
4924
- <use ns4:href="#m0fca2865ba" x="47.72" y="151.576316" style="stroke: #000000; stroke-width: 0.8" />
4925
  </g>
4926
  </g>
4927
  <g id="text_28">
4928
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="155.375535" transform="rotate(-0 40.72 155.375535)">0.5</text>
4929
  </g>
4930
  </g>
4931
  <g id="ytick_5">
4932
  <g id="grid-y--6" class="grid grid-y">
4933
- <path d="M 47.72 74.037704 L 823.142937 74.037704 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4934
  </g>
4935
  <g id="line2d_29">
4936
  <g>
4937
- <use ns4:href="#m0fca2865ba" x="47.72" y="74.037704" style="stroke: #000000; stroke-width: 0.8" />
4938
  </g>
4939
  </g>
4940
  <g id="text_29">
4941
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="77.836923" transform="rotate(-0 40.72 77.836923)">0.6</text>
4942
  </g>
4943
  </g>
4944
  <g id="label--y" class="ylabel">
@@ -4946,34 +4946,34 @@ Installed 37 packages in 219ms
4946
  </g>
4947
  </g>
4948
  <g id="series--torch-eager" class="series">
4949
- <path d="M 82.966497 405.060892 L 113.615625 361.374088 L 144.264753 368.740256 L 174.913881 368.958139 L 205.563009 371.664237 L 236.212137 372.082945 L 266.861265 372.113961 L 297.510393 370.059188 L 328.159521 369.570694 L 358.808648 370.368567 L 389.457776 370.997405 L 420.106904 364.940864 L 450.756032 370.21349 L 481.40516 369.546657 L 512.054288 369.84208 L 542.703416 370.439127 L 573.352544 368.570446 L 604.001672 369.011641 L 634.6508 368.655739 L 665.299928 368.896108 L 695.949056 369.864566 L 726.598184 370.345305 L 757.247312 360.808056 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4950
  <defs>
4951
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4952
  </defs>
4953
  <g clip-path="url(#p088c925177)">
4954
  <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
4955
- <use ns4:href="#md7efaf3aec" x="113.615625" y="361.374088" style="fill: #1f77b4; stroke: #1f77b4" />
4956
- <use ns4:href="#md7efaf3aec" x="144.264753" y="368.740256" style="fill: #1f77b4; stroke: #1f77b4" />
4957
- <use ns4:href="#md7efaf3aec" x="174.913881" y="368.958139" style="fill: #1f77b4; stroke: #1f77b4" />
4958
- <use ns4:href="#md7efaf3aec" x="205.563009" y="371.664237" style="fill: #1f77b4; stroke: #1f77b4" />
4959
- <use ns4:href="#md7efaf3aec" x="236.212137" y="372.082945" style="fill: #1f77b4; stroke: #1f77b4" />
4960
- <use ns4:href="#md7efaf3aec" x="266.861265" y="372.113961" style="fill: #1f77b4; stroke: #1f77b4" />
4961
- <use ns4:href="#md7efaf3aec" x="297.510393" y="370.059188" style="fill: #1f77b4; stroke: #1f77b4" />
4962
- <use ns4:href="#md7efaf3aec" x="328.159521" y="369.570694" style="fill: #1f77b4; stroke: #1f77b4" />
4963
- <use ns4:href="#md7efaf3aec" x="358.808648" y="370.368567" style="fill: #1f77b4; stroke: #1f77b4" />
4964
- <use ns4:href="#md7efaf3aec" x="389.457776" y="370.997405" style="fill: #1f77b4; stroke: #1f77b4" />
4965
- <use ns4:href="#md7efaf3aec" x="420.106904" y="364.940864" style="fill: #1f77b4; stroke: #1f77b4" />
4966
- <use ns4:href="#md7efaf3aec" x="450.756032" y="370.21349" style="fill: #1f77b4; stroke: #1f77b4" />
4967
- <use ns4:href="#md7efaf3aec" x="481.40516" y="369.546657" style="fill: #1f77b4; stroke: #1f77b4" />
4968
- <use ns4:href="#md7efaf3aec" x="512.054288" y="369.84208" style="fill: #1f77b4; stroke: #1f77b4" />
4969
- <use ns4:href="#md7efaf3aec" x="542.703416" y="370.439127" style="fill: #1f77b4; stroke: #1f77b4" />
4970
- <use ns4:href="#md7efaf3aec" x="573.352544" y="368.570446" style="fill: #1f77b4; stroke: #1f77b4" />
4971
- <use ns4:href="#md7efaf3aec" x="604.001672" y="369.011641" style="fill: #1f77b4; stroke: #1f77b4" />
4972
- <use ns4:href="#md7efaf3aec" x="634.6508" y="368.655739" style="fill: #1f77b4; stroke: #1f77b4" />
4973
- <use ns4:href="#md7efaf3aec" x="665.299928" y="368.896108" style="fill: #1f77b4; stroke: #1f77b4" />
4974
- <use ns4:href="#md7efaf3aec" x="695.949056" y="369.864566" style="fill: #1f77b4; stroke: #1f77b4" />
4975
- <use ns4:href="#md7efaf3aec" x="726.598184" y="370.345305" style="fill: #1f77b4; stroke: #1f77b4" />
4976
- <use ns4:href="#md7efaf3aec" x="757.247312" y="360.808056" style="fill: #1f77b4; stroke: #1f77b4" />
4977
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
4978
  </g>
4979
  </g>
 
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
+ <dc:date>2025-10-29T14:27:54.393501</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
 
4216
  <g id="matplotlib.axis_2">
4217
  <g id="ytick_1">
4218
  <g id="grid-y--2" class="grid grid-y">
4219
+ <path d="M 47.72 385.895403 L 823.142937 385.895403 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4220
  </g>
4221
  <g id="line2d_25">
4222
  <defs>
4223
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4224
  </defs>
4225
  <g>
4226
+ <use ns4:href="#m0fca2865ba" x="47.72" y="385.895403" style="stroke: #000000; stroke-width: 0.8" />
4227
  </g>
4228
  </g>
4229
  <g id="text_25">
4230
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="389.694621" transform="rotate(-0 40.72 389.694621)">0.2</text>
4231
  </g>
4232
  </g>
4233
  <g id="ytick_2">
4234
  <g id="grid-y--3" class="grid grid-y">
4235
+ <path d="M 47.72 308.195371 L 823.142937 308.195371 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4236
  </g>
4237
  <g id="line2d_26">
4238
  <g>
4239
+ <use ns4:href="#m0fca2865ba" x="47.72" y="308.195371" style="stroke: #000000; stroke-width: 0.8" />
4240
  </g>
4241
  </g>
4242
  <g id="text_26">
4243
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="311.99459" transform="rotate(-0 40.72 311.99459)">0.3</text>
4244
  </g>
4245
  </g>
4246
  <g id="ytick_3">
4247
  <g id="grid-y--4" class="grid grid-y">
4248
+ <path d="M 47.72 230.49534 L 823.142937 230.49534 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4249
  </g>
4250
  <g id="line2d_27">
4251
  <g>
4252
+ <use ns4:href="#m0fca2865ba" x="47.72" y="230.49534" style="stroke: #000000; stroke-width: 0.8" />
4253
  </g>
4254
  </g>
4255
  <g id="text_27">
4256
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="234.294559" transform="rotate(-0 40.72 234.294559)">0.4</text>
4257
  </g>
4258
  </g>
4259
  <g id="ytick_4">
4260
  <g id="grid-y--5" class="grid grid-y">
4261
+ <path d="M 47.72 152.795309 L 823.142937 152.795309 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4262
  </g>
4263
  <g id="line2d_28">
4264
  <g>
4265
+ <use ns4:href="#m0fca2865ba" x="47.72" y="152.795309" style="stroke: #000000; stroke-width: 0.8" />
4266
  </g>
4267
  </g>
4268
  <g id="text_28">
4269
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="156.594528" transform="rotate(-0 40.72 156.594528)">0.5</text>
4270
  </g>
4271
  </g>
4272
  <g id="ytick_5">
4273
  <g id="grid-y--6" class="grid grid-y">
4274
+ <path d="M 47.72 75.095278 L 823.142937 75.095278 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4275
  </g>
4276
  <g id="line2d_29">
4277
  <g>
4278
+ <use ns4:href="#m0fca2865ba" x="47.72" y="75.095278" style="stroke: #000000; stroke-width: 0.8" />
4279
  </g>
4280
  </g>
4281
  <g id="text_29">
4282
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="78.894497" transform="rotate(-0 40.72 78.894497)">0.6</text>
4283
  </g>
4284
  </g>
4285
  <g id="label--y" class="ylabel">
 
4287
  </g>
4288
  </g>
4289
  <g id="series--torch-eager" class="series">
4290
+ <path d="M 82.966497 405.060892 L 113.615625 365.214762 L 144.264753 368.882204 L 174.913881 373.894633 L 205.563009 367.32121 L 236.212137 373.482823 L 266.861265 373.661533 L 297.510393 372.596265 L 328.159521 372.666195 L 358.808648 373.350733 L 389.457776 373.203103 L 420.106904 366.263713 L 450.756032 372.286242 L 481.40516 372.729132 L 512.054288 373.008852 L 542.703416 372.107532 L 573.352544 372.022062 L 604.001672 374.842573 L 634.6508 372.153375 L 665.299928 372.395022 L 695.949056 373.093546 L 726.598184 372.782745 L 757.247312 362.519348 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4291
  <defs>
4292
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4293
  </defs>
4294
  <g clip-path="url(#p088c925177)">
4295
  <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
4296
+ <use ns4:href="#md7efaf3aec" x="113.615625" y="365.214762" style="fill: #1f77b4; stroke: #1f77b4" />
4297
+ <use ns4:href="#md7efaf3aec" x="144.264753" y="368.882204" style="fill: #1f77b4; stroke: #1f77b4" />
4298
+ <use ns4:href="#md7efaf3aec" x="174.913881" y="373.894633" style="fill: #1f77b4; stroke: #1f77b4" />
4299
+ <use ns4:href="#md7efaf3aec" x="205.563009" y="367.32121" style="fill: #1f77b4; stroke: #1f77b4" />
4300
+ <use ns4:href="#md7efaf3aec" x="236.212137" y="373.482823" style="fill: #1f77b4; stroke: #1f77b4" />
4301
+ <use ns4:href="#md7efaf3aec" x="266.861265" y="373.661533" style="fill: #1f77b4; stroke: #1f77b4" />
4302
+ <use ns4:href="#md7efaf3aec" x="297.510393" y="372.596265" style="fill: #1f77b4; stroke: #1f77b4" />
4303
+ <use ns4:href="#md7efaf3aec" x="328.159521" y="372.666195" style="fill: #1f77b4; stroke: #1f77b4" />
4304
+ <use ns4:href="#md7efaf3aec" x="358.808648" y="373.350733" style="fill: #1f77b4; stroke: #1f77b4" />
4305
+ <use ns4:href="#md7efaf3aec" x="389.457776" y="373.203103" style="fill: #1f77b4; stroke: #1f77b4" />
4306
+ <use ns4:href="#md7efaf3aec" x="420.106904" y="366.263713" style="fill: #1f77b4; stroke: #1f77b4" />
4307
+ <use ns4:href="#md7efaf3aec" x="450.756032" y="372.286242" style="fill: #1f77b4; stroke: #1f77b4" />
4308
+ <use ns4:href="#md7efaf3aec" x="481.40516" y="372.729132" style="fill: #1f77b4; stroke: #1f77b4" />
4309
+ <use ns4:href="#md7efaf3aec" x="512.054288" y="373.008852" style="fill: #1f77b4; stroke: #1f77b4" />
4310
+ <use ns4:href="#md7efaf3aec" x="542.703416" y="372.107532" style="fill: #1f77b4; stroke: #1f77b4" />
4311
+ <use ns4:href="#md7efaf3aec" x="573.352544" y="372.022062" style="fill: #1f77b4; stroke: #1f77b4" />
4312
+ <use ns4:href="#md7efaf3aec" x="604.001672" y="374.842573" style="fill: #1f77b4; stroke: #1f77b4" />
4313
+ <use ns4:href="#md7efaf3aec" x="634.6508" y="372.153375" style="fill: #1f77b4; stroke: #1f77b4" />
4314
+ <use ns4:href="#md7efaf3aec" x="665.299928" y="372.395022" style="fill: #1f77b4; stroke: #1f77b4" />
4315
+ <use ns4:href="#md7efaf3aec" x="695.949056" y="373.093546" style="fill: #1f77b4; stroke: #1f77b4" />
4316
+ <use ns4:href="#md7efaf3aec" x="726.598184" y="372.782745" style="fill: #1f77b4; stroke: #1f77b4" />
4317
+ <use ns4:href="#md7efaf3aec" x="757.247312" y="362.519348" style="fill: #1f77b4; stroke: #1f77b4" />
4318
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
4319
  </g>
4320
  </g>
 
4364
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4365
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4366
  </span> |
4367
+ Cell: combine | 4.35s
4368
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4369
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4370
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4453
  impl wl p50(ms) ok
4454
  hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 False
4455
  hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 False
4456
+ hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.10 False
4457
  hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 False
4458
  hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.09 False
4459
  hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 False
 
4478
  torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
4479
  torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
4480
  torch_eager cuda_B1_S128_H8_D128_R64 0.23 True
4481
+ torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
4482
+ torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True
4483
  torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True
4484
  torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True
4485
  torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True
 
4497
  torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
4498
  torch_eager cuda_B2_S512_H32_D128_R64 0.22 True
4499
  torch_eager cuda_B2_S512_H32_D64_R32 0.22 True
4500
+ torch_eager cuda_B2_S512_H8_D128_R64 0.21 True
4501
  torch_eager cuda_B2_S512_H8_D64_R32 0.22 True
4502
 
4503
  GENERATING COMBINED VISUALIZATION
 
4518
  <div class="uv-install-logs" id="uv-logs-combine">
4519
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4520
  <div class="uv-logs-content" style="display: none;">
4521
+ Installed 37 packages in 239ms
4522
  </div>
4523
  </div>
4524
  <div class="cell-artifacts">
 
4531
  <rdf:RDF>
4532
  <ns2:Work>
4533
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4534
+ <dc:date>2025-10-29T14:27:54.393501</dc:date>
4535
  <dc:format>image/svg+xml</dc:format>
4536
  <dc:creator>
4537
  <ns2:Agent>
 
4875
  <g id="matplotlib.axis_2">
4876
  <g id="ytick_1">
4877
  <g id="grid-y--2" class="grid grid-y">
4878
+ <path d="M 47.72 385.895403 L 823.142937 385.895403 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4879
  </g>
4880
  <g id="line2d_25">
4881
  <defs>
4882
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4883
  </defs>
4884
  <g>
4885
+ <use ns4:href="#m0fca2865ba" x="47.72" y="385.895403" style="stroke: #000000; stroke-width: 0.8" />
4886
  </g>
4887
  </g>
4888
  <g id="text_25">
4889
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="389.694621" transform="rotate(-0 40.72 389.694621)">0.2</text>
4890
  </g>
4891
  </g>
4892
  <g id="ytick_2">
4893
  <g id="grid-y--3" class="grid grid-y">
4894
+ <path d="M 47.72 308.195371 L 823.142937 308.195371 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4895
  </g>
4896
  <g id="line2d_26">
4897
  <g>
4898
+ <use ns4:href="#m0fca2865ba" x="47.72" y="308.195371" style="stroke: #000000; stroke-width: 0.8" />
4899
  </g>
4900
  </g>
4901
  <g id="text_26">
4902
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="311.99459" transform="rotate(-0 40.72 311.99459)">0.3</text>
4903
  </g>
4904
  </g>
4905
  <g id="ytick_3">
4906
  <g id="grid-y--4" class="grid grid-y">
4907
+ <path d="M 47.72 230.49534 L 823.142937 230.49534 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4908
  </g>
4909
  <g id="line2d_27">
4910
  <g>
4911
+ <use ns4:href="#m0fca2865ba" x="47.72" y="230.49534" style="stroke: #000000; stroke-width: 0.8" />
4912
  </g>
4913
  </g>
4914
  <g id="text_27">
4915
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="234.294559" transform="rotate(-0 40.72 234.294559)">0.4</text>
4916
  </g>
4917
  </g>
4918
  <g id="ytick_4">
4919
  <g id="grid-y--5" class="grid grid-y">
4920
+ <path d="M 47.72 152.795309 L 823.142937 152.795309 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4921
  </g>
4922
  <g id="line2d_28">
4923
  <g>
4924
+ <use ns4:href="#m0fca2865ba" x="47.72" y="152.795309" style="stroke: #000000; stroke-width: 0.8" />
4925
  </g>
4926
  </g>
4927
  <g id="text_28">
4928
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="156.594528" transform="rotate(-0 40.72 156.594528)">0.5</text>
4929
  </g>
4930
  </g>
4931
  <g id="ytick_5">
4932
  <g id="grid-y--6" class="grid grid-y">
4933
+ <path d="M 47.72 75.095278 L 823.142937 75.095278 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4934
  </g>
4935
  <g id="line2d_29">
4936
  <g>
4937
+ <use ns4:href="#m0fca2865ba" x="47.72" y="75.095278" style="stroke: #000000; stroke-width: 0.8" />
4938
  </g>
4939
  </g>
4940
  <g id="text_29">
4941
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="78.894497" transform="rotate(-0 40.72 78.894497)">0.6</text>
4942
  </g>
4943
  </g>
4944
  <g id="label--y" class="ylabel">
 
4946
  </g>
4947
  </g>
4948
  <g id="series--torch-eager" class="series">
4949
+ <path d="M 82.966497 405.060892 L 113.615625 365.214762 L 144.264753 368.882204 L 174.913881 373.894633 L 205.563009 367.32121 L 236.212137 373.482823 L 266.861265 373.661533 L 297.510393 372.596265 L 328.159521 372.666195 L 358.808648 373.350733 L 389.457776 373.203103 L 420.106904 366.263713 L 450.756032 372.286242 L 481.40516 372.729132 L 512.054288 373.008852 L 542.703416 372.107532 L 573.352544 372.022062 L 604.001672 374.842573 L 634.6508 372.153375 L 665.299928 372.395022 L 695.949056 373.093546 L 726.598184 372.782745 L 757.247312 362.519348 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4950
  <defs>
4951
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4952
  </defs>
4953
  <g clip-path="url(#p088c925177)">
4954
  <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
4955
+ <use ns4:href="#md7efaf3aec" x="113.615625" y="365.214762" style="fill: #1f77b4; stroke: #1f77b4" />
4956
+ <use ns4:href="#md7efaf3aec" x="144.264753" y="368.882204" style="fill: #1f77b4; stroke: #1f77b4" />
4957
+ <use ns4:href="#md7efaf3aec" x="174.913881" y="373.894633" style="fill: #1f77b4; stroke: #1f77b4" />
4958
+ <use ns4:href="#md7efaf3aec" x="205.563009" y="367.32121" style="fill: #1f77b4; stroke: #1f77b4" />
4959
+ <use ns4:href="#md7efaf3aec" x="236.212137" y="373.482823" style="fill: #1f77b4; stroke: #1f77b4" />
4960
+ <use ns4:href="#md7efaf3aec" x="266.861265" y="373.661533" style="fill: #1f77b4; stroke: #1f77b4" />
4961
+ <use ns4:href="#md7efaf3aec" x="297.510393" y="372.596265" style="fill: #1f77b4; stroke: #1f77b4" />
4962
+ <use ns4:href="#md7efaf3aec" x="328.159521" y="372.666195" style="fill: #1f77b4; stroke: #1f77b4" />
4963
+ <use ns4:href="#md7efaf3aec" x="358.808648" y="373.350733" style="fill: #1f77b4; stroke: #1f77b4" />
4964
+ <use ns4:href="#md7efaf3aec" x="389.457776" y="373.203103" style="fill: #1f77b4; stroke: #1f77b4" />
4965
+ <use ns4:href="#md7efaf3aec" x="420.106904" y="366.263713" style="fill: #1f77b4; stroke: #1f77b4" />
4966
+ <use ns4:href="#md7efaf3aec" x="450.756032" y="372.286242" style="fill: #1f77b4; stroke: #1f77b4" />
4967
+ <use ns4:href="#md7efaf3aec" x="481.40516" y="372.729132" style="fill: #1f77b4; stroke: #1f77b4" />
4968
+ <use ns4:href="#md7efaf3aec" x="512.054288" y="373.008852" style="fill: #1f77b4; stroke: #1f77b4" />
4969
+ <use ns4:href="#md7efaf3aec" x="542.703416" y="372.107532" style="fill: #1f77b4; stroke: #1f77b4" />
4970
+ <use ns4:href="#md7efaf3aec" x="573.352544" y="372.022062" style="fill: #1f77b4; stroke: #1f77b4" />
4971
+ <use ns4:href="#md7efaf3aec" x="604.001672" y="374.842573" style="fill: #1f77b4; stroke: #1f77b4" />
4972
+ <use ns4:href="#md7efaf3aec" x="634.6508" y="372.153375" style="fill: #1f77b4; stroke: #1f77b4" />
4973
+ <use ns4:href="#md7efaf3aec" x="665.299928" y="372.395022" style="fill: #1f77b4; stroke: #1f77b4" />
4974
+ <use ns4:href="#md7efaf3aec" x="695.949056" y="373.093546" style="fill: #1f77b4; stroke: #1f77b4" />
4975
+ <use ns4:href="#md7efaf3aec" x="726.598184" y="372.782745" style="fill: #1f77b4; stroke: #1f77b4" />
4976
+ <use ns4:href="#md7efaf3aec" x="757.247312" y="362.519348" style="fill: #1f77b4; stroke: #1f77b4" />
4977
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
4978
  </g>
4979
  </g>