drbh (HF Staff) committed
Commit 9b13459 · verified · Parent(s): 0c5e7b7

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. activation/impls/artifacts/benchmark/activation.jsonl +9 -9
  2. activation/impls/hf_kernels_swiglu.html +101 -99
  3. activation/impls/torch_swiglu.html +124 -124
  4. activation/results/artifacts/combine/latency.svg +2 -2
  5. activation/results/combined_results.html +76 -76
  6. causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +24 -24
  7. causal_conv1d/impls/cells/benchmark.py +18 -9
  8. causal_conv1d/impls/hf_kernels_causal_conv1d.html +0 -0
  9. causal_conv1d/impls/torch_causal_conv1d.html +0 -0
  10. causal_conv1d/results/artifacts/combine/latency.svg +1 -1
  11. causal_conv1d/results/combined_results.html +141 -141
  12. deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl +4 -0
  13. deformable_detr/impls/cells/benchmark.py +118 -0
  14. deformable_detr/impls/cells/nv.py +2 -0
  15. deformable_detr/impls/hf_kernels_deformable_detr.html +0 -0
  16. deformable_detr/impls/index.html +89 -0
  17. deformable_detr/impls/torch_deformable_detr.html +0 -0
  18. deformable_detr/index.html +89 -0
  19. deformable_detr/results/artifacts/combine/latency.svg +3 -0
  20. deformable_detr/results/cells/combine.py +26 -0
  21. deformable_detr/results/combined_results.html +0 -0
  22. deformable_detr/results/index.html +88 -0
  23. flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -6
  24. flash_attn/impls/cells/benchmark.py +8 -10
  25. flash_attn/impls/flash_attention.html +143 -149
  26. flash_attn/impls/hf_kernels_flash_attn.html +97 -102
  27. flash_attn/impls/hf_kernels_flash_attn3.html +79 -79
  28. flash_attn/impls/mem_efficient_attention.html +133 -133
  29. flash_attn/impls/sage_attention.html +18 -14
  30. flash_attn/impls/xformers.html +137 -91
  31. flash_attn/results/artifacts/combine/latency.svg +2 -2
  32. flash_attn/results/combined_results.html +143 -143
  33. index.html +205 -51
  34. layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +4 -4
  35. layer_norm/impls/cells/benchmark.py +5 -28
  36. layer_norm/impls/hf_kernels_layer_norm.html +59 -56
  37. layer_norm/impls/torch_layer_norm.html +56 -62
  38. layer_norm/results/artifacts/combine/latency.svg +2 -2
  39. layer_norm/results/combined_results.html +53 -53
  40. openai_moe/impls/artifacts/benchmark/openai_moe.jsonl +8 -0
  41. openai_moe/impls/binned_torch.html +0 -0
  42. openai_moe/impls/cells/benchmark.py +136 -0
  43. openai_moe/impls/cells/nv.py +2 -0
  44. openai_moe/impls/gpt_oss_moe.html +0 -0
  45. openai_moe/impls/index.html +89 -0
  46. openai_moe/index.html +89 -0
  47. openai_moe/results/artifacts/combine/latency.svg +3 -0
  48. openai_moe/results/cells/combine.py +27 -0
  49. openai_moe/results/combined_results.html +0 -0
  50. openai_moe/results/index.html +88 -0
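Each kernel family in this commit follows the same layout: `impls/` holds one rendered page per implementation plus a `cells/benchmark.py` that writes raw records to `artifacts/benchmark/<name>.jsonl`, while `results/` holds a `cells/combine.py` that merges those records into `combined_results.html` and `artifacts/combine/latency.svg`. Below is a minimal sketch of that aggregation step, assuming only the JSONL schema visible in the activation diff that follows; the real `combine.py` is not part of this view.

```python
# Hedged sketch of the combine step implied by the layout above:
# gather every per-implementation JSONL and pivot p50 latency by
# workload. The real cells/combine.py (not shown) also renders
# latency.svg and combined_results.html.
import glob
import json
from collections import defaultdict

table = defaultdict(dict)  # workload name -> {impl: p50 latency in ms}
for path in glob.glob("*/impls/artifacts/benchmark/*.jsonl"):
    with open(path) as f:
        for line in f:
            if line.strip():
                rec = json.loads(line)
                table[rec["wl"]["name"]][rec["impl"]] = rec["lat_ms"]["p50"]

for wl, impls in sorted(table.items()):
    print(wl, {impl: round(p50, 4) for impl, p50 in impls.items()})
```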
activation/impls/artifacts/benchmark/activation.jsonl CHANGED
@@ -1,9 +1,9 @@
1
- {"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.022969999974975508, "p50": 0.023499999997511622, "p90": 0.023961000010785938, "mean": 0.02361460000201987, "iqr": 0.0009899999895424116, "raw_times": [0.022971000021243526, 0.022969999974975508, 0.023961000010785938, 0.023499999997511622, 0.024671000005582755], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03073999999969601, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
2
- {"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027540000019143918, "p50": 0.029130999962490023, "p90": 0.03002100004323438, "mean": 0.029014800009008468, "iqr": 0.0016900000332498166, "raw_times": [0.027540000019143918, 0.030051000010189455, 0.03002100004323438, 0.029130999962490023, 0.028331000009984564], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03343999998151048, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
3
- {"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02829999999676147, "p50": 0.029119999965132592, "p90": 0.03051000004461457, "mean": 0.029939999990347133, "iqr": 0.0019500000689731678, "raw_times": [0.02829999999676147, 0.03051000004461457, 0.033209999969585624, 0.029119999965132592, 0.028559999975641404], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031761000002461515, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
4
- {"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027531000000635686, "p50": 0.028170999996746104, "p90": 0.028501000031155854, "mean": 0.028293000002577173, "iqr": 0.0008900000239009387, "raw_times": [0.027611000007254916, 0.028170999996746104, 0.029650999977093306, 0.027531000000635686, 0.028501000031155854], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03130000004603062, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
5
- {"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02755000002707675, "p50": 0.02861000001530556, "p90": 0.028831000008722185, "mean": 0.02867660001584227, "iqr": 0.00023000001192485797, "raw_times": [0.028600999996797327, 0.029791000031309522, 0.028831000008722185, 0.02755000002707675, 0.02861000001530556], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03139000000373926, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
6
- {"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02618100000972845, "p50": 0.027131000024382956, "p90": 0.02731099999664366, "mean": 0.026918799994746223, "iqr": 0.0007610000238855719, "raw_times": [0.02618100000972845, 0.027131000024382956, 0.027420999970217963, 0.02731099999664366, 0.026549999972758087], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03008099997714453, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
7
- {"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026950999995278835, "p50": 0.02748000002839035, "p90": 0.02804100000730614, "mean": 0.02758480000011332, "iqr": 0.0006300000450210064, "raw_times": [0.026950999995278835, 0.02804100000730614, 0.027410999962285132, 0.02804100000730614, 0.02748000002839035], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03104999996139668, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
8
- {"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026300000001810986, "p50": 0.02733100001250932, "p90": 0.0275399999623005, "mean": 0.02720039998393986, "iqr": 0.0004789999934473599, "raw_times": [0.02706099996885314, 0.02733100001250932, 0.027769999974225357, 0.0275399999623005, 0.026300000001810986], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03032000000757762, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
9
- {"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02642100002958614, "p50": 0.027860999978202017, "p90": 0.02790100000993334, "mean": 0.027615000010428048, "iqr": 0.00036000000136482413, "raw_times": [0.02642100002958614, 0.028351000025850226, 0.027541000008568517, 0.02790100000993334, 0.027860999978202017], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03163999997468636, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024230000008174102, "p50": 0.024741000004269154, "p90": 0.025410999967334646, "mean": 0.024872599999525846, "iqr": 0.0011599999538702832, "raw_times": [0.024251000013464363, 0.025730000004386966, 0.024230000008174102, 0.025410999967334646, 0.024741000004269154], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03134100001034312, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
2
+ {"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026611000009779673, "p50": 0.029731000040555955, "p90": 0.03027100001418148, "mean": 0.029349000021738902, "iqr": 0.0009999999974752427, "raw_times": [0.026611000009779673, 0.029731000040555955, 0.030861000027471164, 0.03027100001418148, 0.02927100001670624], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.034871000025304966, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
3
+ {"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027259999967554904, "p50": 0.02879100003383428, "p90": 0.030951000042023225, "mean": 0.029224800016436348, "iqr": 0.0029600000175378227, "raw_times": [0.027991000024485402, 0.031131000014283927, 0.02879100003383428, 0.030951000042023225, 0.027259999967554904], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.0323909999906391, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
4
+ {"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025391000008312403, "p50": 0.02888100004838634, "p90": 0.029160999986288516, "mean": 0.028055000007043418, "iqr": 0.001839999981712026, "raw_times": [0.025391000008312403, 0.02888100004838634, 0.02952099998765334, 0.029160999986288516, 0.02732100000457649], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031509999985246395, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
5
+ {"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026630000036220736, "p50": 0.027450000004591857, "p90": 0.027921000025799003, "mean": 0.02735460001304091, "iqr": 0.0010800000040944724, "raw_times": [0.026630000036220736, 0.027450000004591857, 0.02684100002170453, 0.027921000025799003, 0.027930999976888415], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03172099997073019, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
6
+ {"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025049999976545223, "p50": 0.02733100001250932, "p90": 0.028329999963716546, "mean": 0.02741439998317219, "iqr": 0.0016189999882953998, "raw_times": [0.025049999976545223, 0.029649999987668707, 0.028329999963716546, 0.02733100001250932, 0.026710999975421146], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029950999987704563, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
7
+ {"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028341000017917395, "p50": 0.02927099995986282, "p90": 0.029501000028631097, "mean": 0.02909080000108588, "iqr": 0.0009110000291912002, "raw_times": [0.028341000017917395, 0.02927099995986282, 0.029501000028631097, 0.029750999999578198, 0.028589999999439897], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03009099998507736, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
8
+ {"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024770999971224228, "p50": 0.02814099997294761, "p90": 0.028720999978304462, "mean": 0.0278467999919485, "iqr": 0.0007409999511764909, "raw_times": [0.024770999971224228, 0.02798000002712797, 0.028720999978304462, 0.02814099997294761, 0.029621000010138232], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031990999957542954, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
9
+ {"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027751000004627713, "p50": 0.028230999987499672, "p90": 0.029471000004832604, "mean": 0.028608800005258672, "iqr": 0.0016500000015184924, "raw_times": [0.028230999987499672, 0.027751000004627713, 0.02782100000331411, 0.02977000002601926, 0.029471000004832604], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030850999962694914, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
activation/impls/hf_kernels_swiglu.html CHANGED
@@ -4106,11 +4106,12 @@ body[data-tool="eraser"] .main-content {
4106
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
4107
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
4108
  </span> |
4109
- Cell: nv | 0.23s
4110
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
4112
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
4113
- <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/activation/impls/hf_kernels_swiglu.md" target="_blank" class="github-btn">GitHub</a>
 
4114
  </div>
4115
  <div id="code-nv" class="cell-code" data-lines="2">
4116
  <div class="code-wrap">
@@ -4122,7 +4123,7 @@ Cell: nv | 0.23s
4122
  </div>
4123
  </div>
4124
  <div id="output-nv" class="cell-output">
4125
- <div class="cell-stdout"><pre class="stdout-text">Thu Oct 30 15:52:16 2025
4126
  +-----------------------------------------------------------------------------------------+
4127
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
4128
  |-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4132,7 @@ Cell: nv | 0.23s
4131
  | | | MIG M. |
4132
  |=========================================+========================+======================|
4133
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
4134
- | N/A 29C P0 86W / 350W | 0MiB / 46068MiB | 0% Default |
4135
  | | | N/A |
4136
  +-----------------------------------------+------------------------+----------------------+
4137
 
@@ -4155,11 +4156,12 @@ Cell: nv | 0.23s
4155
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4156
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
4157
  </span> |
4158
- Cell: benchmark | 4.17s
4159
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4160
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4161
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4162
- <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/activation/impls/hf_kernels_swiglu.md" target="_blank" class="github-btn">GitHub</a>
 
4163
  </div>
4164
  <div id="code-benchmark" class="cell-code" data-lines="34">
4165
  <div class="code-wrap">
@@ -4211,17 +4213,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
4211
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4212
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4213
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4214
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 78.752us 1953.17% 78.752us 78.752us 1
4215
- hf_kernels_swiglu 9.29% 160.875us 99.59% 1.725ms 1.725ms 0.000us 0.00% 5.440us 5.440us 1
4216
- _activation_beeaae6::silu_and_mul 1.15% 19.839us 87.61% 1.518ms 505.995us 4.032us 100.00% 5.440us 1.813us 3
4217
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.032us 100.00% 4.032us 1.344us 3
4218
- Activity Buffer Request 83.97% 1.455ms 83.97% 1.455ms 1.455ms 1.408us 34.92% 1.408us 1.408us 1
4219
- aten::empty 2.69% 46.600us 2.69% 46.600us 15.533us 0.000us 0.00% 0.000us 0.000us 3
4220
- cudaLaunchKernel 2.49% 43.201us 2.49% 43.201us 14.400us 0.000us 0.00% 0.000us 0.000us 3
4221
- cudaDeviceSynchronize 0.41% 7.161us 0.41% 7.161us 7.161us 0.000us 0.00% 0.000us 0.000us 1
4222
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4223
- Self CPU time total: 1.733ms
4224
- Self CUDA time total: 4.032us
4225
 
4226
 
4227
 
@@ -4231,17 +4233,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
4231
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4232
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4233
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4234
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.528us 1575.81% 62.528us 62.528us 1
4235
- hf_kernels_swiglu 6.86% 110.833us 99.69% 1.610ms 1.610ms 0.000us 0.00% 5.312us 5.312us 1
4236
- _activation_beeaae6::silu_and_mul 1.31% 21.159us 91.69% 1.481ms 493.565us 3.968us 100.00% 5.312us 1.771us 3
4237
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3
4238
- Activity Buffer Request 88.77% 1.434ms 88.77% 1.434ms 1.434ms 1.344us 33.87% 1.344us 1.344us 1
4239
- aten::empty 1.14% 18.330us 1.14% 18.330us 6.110us 0.000us 0.00% 0.000us 0.000us 3
4240
- cudaLaunchKernel 1.61% 26.001us 1.61% 26.001us 8.667us 0.000us 0.00% 0.000us 0.000us 3
4241
- cudaDeviceSynchronize 0.31% 5.030us 0.31% 5.030us 5.030us 0.000us 0.00% 0.000us 0.000us 1
4242
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4243
- Self CPU time total: 1.615ms
4244
- Self CUDA time total: 3.968us
4245
 
4246
 
4247
 
@@ -4251,17 +4253,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
4251
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4252
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4253
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4254
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.232us 1291.50% 63.232us 63.232us 1
4255
- hf_kernels_swiglu 6.20% 101.121us 99.70% 1.627ms 1.627ms 0.000us 0.00% 6.528us 6.528us 1
4256
- _activation_beeaae6::silu_and_mul 1.27% 20.780us 92.37% 1.507ms 502.489us 4.896us 100.00% 6.528us 2.176us 3
4257
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.896us 100.00% 4.896us 1.632us 3
4258
- Activity Buffer Request 89.54% 1.461ms 89.54% 1.461ms 1.461ms 1.632us 33.33% 1.632us 1.632us 1
4259
- aten::empty 1.13% 18.440us 1.13% 18.440us 6.147us 0.000us 0.00% 0.000us 0.000us 3
4260
- cudaLaunchKernel 1.56% 25.391us 1.56% 25.391us 8.464us 0.000us 0.00% 0.000us 0.000us 3
4261
- cudaDeviceSynchronize 0.30% 4.970us 0.30% 4.970us 4.970us 0.000us 0.00% 0.000us 0.000us 1
4262
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4263
- Self CPU time total: 1.632ms
4264
- Self CUDA time total: 4.896us
4265
 
4266
 
4267
 
@@ -4271,17 +4273,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
4271
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4272
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4273
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4274
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.664us 1554.55% 65.664us 65.664us 1
4275
- hf_kernels_swiglu 5.63% 101.442us 99.74% 1.798ms 1.798ms 0.000us 0.00% 5.632us 5.632us 1
4276
- _activation_beeaae6::silu_and_mul 1.18% 21.341us 92.99% 1.677ms 558.850us 4.224us 100.00% 5.632us 1.877us 3
4277
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.224us 100.00% 4.224us 1.408us 3
4278
- Activity Buffer Request 79.26% 1.429ms 79.26% 1.429ms 1.429ms 1.408us 33.33% 1.408us 1.408us 1
4279
- aten::empty 1.12% 20.239us 1.12% 20.239us 6.746us 0.000us 0.00% 0.000us 0.000us 3
4280
- cudaLaunchKernel 12.54% 226.164us 12.54% 226.164us 75.388us 0.000us 0.00% 0.000us 0.000us 3
4281
- cudaDeviceSynchronize 0.26% 4.649us 0.26% 4.649us 4.649us 0.000us 0.00% 0.000us 0.000us 1
4282
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4283
- Self CPU time total: 1.803ms
4284
- Self CUDA time total: 4.224us
4285
 
4286
 
4287
 
@@ -4291,17 +4293,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
4291
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4292
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4293
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4294
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.968us 1086.23% 63.968us 63.968us 1
4295
- hf_kernels_swiglu 19.44% 85.062us 98.79% 432.257us 432.257us 0.000us 0.00% 7.874us 7.874us 1
4296
- _activation_beeaae6::silu_and_mul 4.74% 20.731us 74.99% 328.126us 109.375us 5.889us 100.00% 7.874us 2.625us 3
4297
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.889us 100.00% 5.889us 1.963us 3
4298
- Activity Buffer Request 29.32% 128.302us 29.32% 128.302us 128.302us 1.985us 33.71% 1.985us 1.985us 1
4299
- aten::empty 4.36% 19.069us 4.36% 19.069us 6.356us 0.000us 0.00% 0.000us 0.000us 3
4300
- cudaLaunchKernel 40.93% 179.093us 40.93% 179.093us 59.698us 0.000us 0.00% 0.000us 0.000us 3
4301
- cudaDeviceSynchronize 1.21% 5.289us 1.21% 5.289us 5.289us 0.000us 0.00% 0.000us 0.000us 1
4302
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4303
- Self CPU time total: 437.546us
4304
- Self CUDA time total: 5.889us
4305
 
4306
 
4307
 
@@ -4311,17 +4313,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
4311
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4312
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4313
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4314
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.167us 867.45% 67.167us 67.167us 1
4315
- hf_kernels_swiglu 5.97% 103.951us 99.66% 1.736ms 1.736ms 0.000us 0.00% 10.335us 10.335us 1
4316
- _activation_beeaae6::silu_and_mul 1.17% 20.451us 92.57% 1.612ms 537.363us 7.743us 100.00% 10.335us 3.445us 3
4317
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.743us 100.00% 7.743us 2.581us 3
4318
- Activity Buffer Request 82.03% 1.429ms 82.03% 1.429ms 1.429ms 2.592us 33.48% 2.592us 2.592us 1
4319
- aten::empty 1.12% 19.510us 1.12% 19.510us 6.503us 0.000us 0.00% 0.000us 0.000us 3
4320
- cudaLaunchKernel 9.36% 162.983us 9.36% 162.983us 54.328us 0.000us 0.00% 0.000us 0.000us 3
4321
- cudaDeviceSynchronize 0.34% 5.970us 0.34% 5.970us 5.970us 0.000us 0.00% 0.000us 0.000us 1
4322
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4323
- Self CPU time total: 1.742ms
4324
- Self CUDA time total: 7.743us
4325
 
4326
 
4327
 
@@ -4331,17 +4333,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
4331
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4332
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4333
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4334
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.999us 1036.41% 67.999us 67.999us 1
4335
- hf_kernels_swiglu 5.88% 101.172us 99.74% 1.716ms 1.716ms 0.000us 0.00% 8.769us 8.769us 1
4336
- _activation_beeaae6::silu_and_mul 1.20% 20.670us 92.73% 1.596ms 531.873us 6.561us 100.00% 8.769us 2.923us 3
4337
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.561us 100.00% 6.561us 2.187us 3
4338
- Activity Buffer Request 82.56% 1.421ms 82.56% 1.421ms 1.421ms 2.208us 33.65% 2.208us 2.208us 1
4339
- aten::empty 1.13% 19.490us 1.13% 19.490us 6.497us 0.000us 0.00% 0.000us 0.000us 3
4340
- cudaLaunchKernel 8.96% 154.233us 8.96% 154.233us 51.411us 0.000us 0.00% 0.000us 0.000us 3
4341
- cudaDeviceSynchronize 0.26% 4.490us 0.26% 4.490us 4.490us 0.000us 0.00% 0.000us 0.000us 1
4342
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4343
- Self CPU time total: 1.721ms
4344
- Self CUDA time total: 6.561us
4345
 
4346
 
4347
 
@@ -4351,17 +4353,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
4351
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4352
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4353
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4354
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.295us 670.43% 63.295us 63.295us 1
4355
- hf_kernels_swiglu 23.24% 86.211us 98.67% 366.026us 366.026us 0.000us 0.00% 12.609us 12.609us 1
4356
- _activation_beeaae6::silu_and_mul 5.71% 21.191us 70.40% 261.155us 87.052us 9.441us 100.00% 12.609us 4.203us 3
4357
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.441us 100.00% 9.441us 3.147us 3
4358
- Activity Buffer Request 23.85% 88.481us 23.85% 88.481us 88.481us 3.168us 33.56% 3.168us 3.168us 1
4359
- aten::empty 5.03% 18.660us 5.03% 18.660us 6.220us 0.000us 0.00% 0.000us 0.000us 3
4360
- cudaLaunchKernel 40.84% 151.483us 40.84% 151.483us 50.494us 0.000us 0.00% 0.000us 0.000us 3
4361
- cudaDeviceSynchronize 1.33% 4.920us 1.33% 4.920us 4.920us 0.000us 0.00% 0.000us 0.000us 1
4362
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4363
- Self CPU time total: 370.946us
4364
- Self CUDA time total: 9.441us
4365
 
4366
 
4367
 
@@ -4371,17 +4373,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
4371
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4372
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4373
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4374
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.342us 500.47% 65.342us 65.342us 1
4375
- hf_kernels_swiglu 22.94% 96.471us 98.88% 415.727us 415.727us 0.000us 0.00% 17.408us 17.408us 1
4376
- _activation_beeaae6::silu_and_mul 5.11% 21.490us 71.29% 299.725us 99.908us 13.056us 100.00% 17.408us 5.803us 3
4377
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.056us 100.00% 13.056us 4.352us 3
4378
- Activity Buffer Request 30.59% 128.632us 30.59% 128.632us 128.632us 4.352us 33.33% 4.352us 4.352us 1
4379
- aten::empty 4.65% 19.531us 4.65% 19.531us 6.510us 0.000us 0.00% 0.000us 0.000us 3
4380
- cudaLaunchKernel 35.58% 149.603us 35.58% 149.603us 49.868us 0.000us 0.00% 0.000us 0.000us 3
4381
- cudaDeviceSynchronize 1.12% 4.720us 1.12% 4.720us 4.720us 0.000us 0.00% 0.000us 0.000us 1
4382
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4383
- Self CPU time total: 420.447us
4384
- Self CUDA time total: 13.056us
4385
 
4386
 
4387
  impl wl p50(ms) ok
@@ -4398,12 +4400,12 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
4398
  <div class="uv-install-logs" id="uv-logs-benchmark">
4399
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4400
  <div class="uv-logs-content" style="display: none;">
4401
- Installed 15 packages in 13ms
4402
  </div>
4403
  </div>
4404
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4405
- Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 14.50it/s]
4406
- Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 20.28it/s]</div>
4407
  <div class="cell-artifacts">
4408
  <h4>Artifacts:</h4>
4409
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 
4106
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
4107
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
4108
  </span> |
4109
+ Cell: nv | 0.26s
4110
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
4112
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
4113
+ <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/activation/impls/hf_kernels_swiglu.md" target="_blank" class="github-btn">GitHub</a>
4114
+ <a href="https://huggingface.co/kernels-community/activation" target="_blank" class="hf-btn">🤗 HF</a>
4115
  </div>
4116
  <div id="code-nv" class="cell-code" data-lines="2">
4117
  <div class="code-wrap">
 
4123
  </div>
4124
  </div>
4125
  <div id="output-nv" class="cell-output">
4126
+ <div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:00:17 2025
4127
  +-----------------------------------------------------------------------------------------+
4128
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
4129
  |-----------------------------------------+------------------------+----------------------+
 
4132
  | | | MIG M. |
4133
  |=========================================+========================+======================|
4134
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
4135
+ | N/A 33C P0 108W / 350W | 0MiB / 46068MiB | 88% Default |
4136
  | | | N/A |
4137
  +-----------------------------------------+------------------------+----------------------+
4138
 
 
4156
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4157
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
4158
  </span> |
4159
+ Cell: benchmark | 4.19s
4160
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4161
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4162
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4163
+ <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/activation/impls/hf_kernels_swiglu.md" target="_blank" class="github-btn">GitHub</a>
4164
+ <a href="https://huggingface.co/kernels-community/activation" target="_blank" class="hf-btn">🤗 HF</a>
4165
  </div>
4166
  <div id="code-benchmark" class="cell-code" data-lines="34">
4167
  <div class="code-wrap">
 
4213
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4214
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4215
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4216
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 105.055us 2585.65% 105.055us 105.055us 1
4217
+ hf_kernels_swiglu 11.41% 202.714us 99.64% 1.770ms 1.770ms 0.000us 0.00% 5.471us 5.471us 1
4218
+ _activation_beeaae6::silu_and_mul 1.18% 21.050us 84.47% 1.501ms 500.190us 4.063us 100.00% 5.471us 1.824us 3
4219
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.063us 100.00% 4.063us 1.354us 3
4220
+ Activity Buffer Request 80.70% 1.434ms 80.70% 1.434ms 1.434ms 1.408us 34.65% 1.408us 1.408us 1
4221
+ aten::empty 3.76% 66.772us 3.76% 66.772us 22.257us 0.000us 0.00% 0.000us 0.000us 3
4222
+ cudaLaunchKernel 2.58% 45.872us 2.58% 45.872us 15.291us 0.000us 0.00% 0.000us 0.000us 3
4223
+ cudaDeviceSynchronize 0.36% 6.420us 0.36% 6.420us 6.420us 0.000us 0.00% 0.000us 0.000us 1
4224
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4225
+ Self CPU time total: 1.776ms
4226
+ Self CUDA time total: 4.063us
4227
 
4228
 
4229
 
 
4233
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4234
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4235
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4236
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 61.119us 1540.69% 61.119us 61.119us 1
4237
+ hf_kernels_swiglu 6.50% 104.811us 99.67% 1.607ms 1.607ms 0.000us 0.00% 5.279us 5.279us 1
4238
+ _activation_beeaae6::silu_and_mul 1.26% 20.331us 91.95% 1.482ms 494.073us 3.967us 100.00% 5.279us 1.760us 3
4239
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.967us 100.00% 3.967us 1.322us 3
4240
+ Activity Buffer Request 89.13% 1.437ms 89.13% 1.437ms 1.437ms 1.312us 33.07% 1.312us 1.312us 1
4241
+ aten::empty 1.22% 19.632us 1.22% 19.632us 6.544us 0.000us 0.00% 0.000us 0.000us 3
4242
+ cudaLaunchKernel 1.56% 25.120us 1.56% 25.120us 8.373us 0.000us 0.00% 0.000us 0.000us 3
4243
+ cudaDeviceSynchronize 0.33% 5.360us 0.33% 5.360us 5.360us 0.000us 0.00% 0.000us 0.000us 1
4244
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4245
+ Self CPU time total: 1.612ms
4246
+ Self CUDA time total: 3.967us
4247
 
4248
 
4249
 
 
4253
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4254
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4255
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4256
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.488us 1288.31% 63.488us 63.488us 1
4257
+ hf_kernels_swiglu 6.89% 111.363us 99.67% 1.611ms 1.611ms 0.000us 0.00% 6.592us 6.592us 1
4258
+ _activation_beeaae6::silu_and_mul 1.36% 22.028us 91.47% 1.479ms 492.912us 4.928us 100.00% 6.592us 2.197us 3
4259
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.928us 100.00% 4.928us 1.643us 3
4260
+ Activity Buffer Request 88.52% 1.431ms 88.52% 1.431ms 1.431ms 1.664us 33.77% 1.664us 1.664us 1
4261
+ aten::empty 1.30% 21.081us 1.30% 21.081us 7.027us 0.000us 0.00% 0.000us 0.000us 3
4262
+ cudaLaunchKernel 1.59% 25.652us 1.59% 25.652us 8.551us 0.000us 0.00% 0.000us 0.000us 3
4263
+ cudaDeviceSynchronize 0.33% 5.390us 0.33% 5.390us 5.390us 0.000us 0.00% 0.000us 0.000us 1
4264
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4265
+ Self CPU time total: 1.617ms
4266
+ Self CUDA time total: 4.928us
4267
 
4268
 
4269
 
 
4273
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4274
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4275
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4276
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.000us 1585.82% 68.000us 68.000us 1
4277
+ hf_kernels_swiglu 5.97% 106.915us 99.70% 1.784ms 1.784ms 0.000us 0.00% 5.760us 5.760us 1
4278
+ _activation_beeaae6::silu_and_mul 1.16% 20.770us 92.62% 1.658ms 552.564us 4.288us 100.00% 5.760us 1.920us 3
4279
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.288us 100.00% 4.288us 1.429us 3
4280
+ Activity Buffer Request 80.58% 1.442ms 80.58% 1.442ms 1.442ms 1.472us 34.33% 1.472us 1.472us 1
4281
+ aten::empty 1.10% 19.770us 1.10% 19.770us 6.590us 0.000us 0.00% 0.000us 0.000us 3
4282
+ cudaLaunchKernel 10.88% 194.785us 10.88% 194.785us 64.928us 0.000us 0.00% 0.000us 0.000us 3
4283
+ cudaDeviceSynchronize 0.30% 5.350us 0.30% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1
4284
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4285
+ Self CPU time total: 1.790ms
4286
+ Self CUDA time total: 4.288us
4287
 
4288
 
4289
 
 
4293
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4294
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4295
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4296
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.599us 1108.28% 65.599us 65.599us 1
4297
+ hf_kernels_swiglu 18.75% 89.073us 98.88% 469.813us 469.813us 0.000us 0.00% 7.903us 7.903us 1
4298
+ _activation_beeaae6::silu_and_mul 4.69% 22.280us 76.20% 362.069us 120.690us 5.919us 100.00% 7.903us 2.634us 3
4299
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.919us 100.00% 5.919us 1.973us 3
4300
+ Activity Buffer Request 38.23% 181.645us 38.23% 181.645us 181.645us 1.984us 33.52% 1.984us 1.984us 1
4301
+ aten::empty 3.93% 18.671us 3.93% 18.671us 6.224us 0.000us 0.00% 0.000us 0.000us 3
4302
+ cudaLaunchKernel 33.28% 158.144us 33.28% 158.144us 52.715us 0.000us 0.00% 0.000us 0.000us 3
4303
+ cudaDeviceSynchronize 1.12% 5.330us 1.12% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1
4304
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4305
+ Self CPU time total: 475.143us
4306
+ Self CUDA time total: 5.919us
4307
 
4308
 
4309
 
 
4313
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4314
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4315
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4316
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.207us 906.60% 70.207us 70.207us 1
4317
+ hf_kernels_swiglu 6.12% 106.261us 99.74% 1.733ms 1.733ms 0.000us 0.00% 10.336us 10.336us 1
4318
+ _activation_beeaae6::silu_and_mul 1.25% 21.782us 92.41% 1.606ms 535.254us 7.744us 100.00% 10.336us 3.445us 3
4319
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 100.00% 7.744us 2.581us 3
4320
+ Activity Buffer Request 82.36% 1.431ms 82.36% 1.431ms 1.431ms 2.592us 33.47% 2.592us 2.592us 1
4321
+ aten::empty 1.21% 21.081us 1.21% 21.081us 7.027us 0.000us 0.00% 0.000us 0.000us 3
4322
+ cudaLaunchKernel 8.80% 152.893us 8.80% 152.893us 50.964us 0.000us 0.00% 0.000us 0.000us 3
4323
+ cudaDeviceSynchronize 0.26% 4.511us 0.26% 4.511us 4.511us 0.000us 0.00% 0.000us 0.000us 1
4324
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4325
+ Self CPU time total: 1.738ms
4326
+ Self CUDA time total: 7.744us
4327
 
4328
 
4329
 
 
4333
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4334
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4335
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4336
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.214us 1045.06% 69.214us 69.214us 1
4337
+ hf_kernels_swiglu 7.00% 122.783us 99.73% 1.750ms 1.750ms 0.000us 0.00% 8.830us 8.830us 1
4338
+ _activation_beeaae6::silu_and_mul 1.22% 21.430us 91.58% 1.607ms 535.694us 6.623us 100.00% 8.830us 2.943us 3
4339
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.623us 100.00% 6.623us 2.208us 3
4340
+ Activity Buffer Request 81.74% 1.434ms 81.74% 1.434ms 1.434ms 2.207us 33.32% 2.207us 2.207us 1
4341
+ aten::empty 1.15% 20.211us 1.15% 20.211us 6.737us 0.000us 0.00% 0.000us 0.000us 3
4342
+ cudaLaunchKernel 8.62% 151.304us 8.62% 151.304us 50.435us 0.000us 0.00% 0.000us 0.000us 3
4343
+ cudaDeviceSynchronize 0.27% 4.780us 0.27% 4.780us 4.780us 0.000us 0.00% 0.000us 0.000us 1
4344
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4345
+ Self CPU time total: 1.755ms
4346
+ Self CUDA time total: 6.623us
4347
 
4348
 
4349
 
 
4353
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4354
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4355
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4356
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.152us 692.52% 65.152us 65.152us 1
4357
+ hf_kernels_swiglu 21.62% 91.474us 98.93% 418.571us 418.571us 0.000us 0.00% 12.576us 12.576us 1
4358
+ _activation_beeaae6::silu_and_mul 4.88% 20.631us 69.03% 292.067us 97.356us 9.408us 100.00% 12.576us 4.192us 3
4359
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.408us 100.00% 9.408us 3.136us 3
4360
+ Activity Buffer Request 28.63% 121.143us 28.63% 121.143us 121.143us 3.168us 33.67% 3.168us 3.168us 1
4361
+ aten::empty 8.28% 35.030us 8.28% 35.030us 11.677us 0.000us 0.00% 0.000us 0.000us 3
4362
+ cudaLaunchKernel 35.52% 150.293us 35.52% 150.293us 50.098us 0.000us 0.00% 0.000us 0.000us 3
4363
+ cudaDeviceSynchronize 1.07% 4.530us 1.07% 4.530us 4.530us 0.000us 0.00% 0.000us 0.000us 1
4364
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4365
+ Self CPU time total: 423.101us
4366
+ Self CUDA time total: 9.408us
4367
 
4368
 
4369
 
 
4373
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4374
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4375
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4376
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.197us 514.72% 67.197us 67.197us 1
4377
+ hf_kernels_swiglu 22.39% 97.642us 98.93% 431.481us 431.481us 0.000us 0.00% 17.439us 17.439us 1
4378
+ _activation_beeaae6::silu_and_mul 4.99% 21.781us 71.94% 313.789us 104.596us 13.055us 100.00% 17.439us 5.813us 3
4379
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.055us 100.00% 13.055us 4.352us 3
4380
+ Activity Buffer Request 32.48% 141.684us 32.48% 141.684us 141.684us 4.384us 33.58% 4.384us 4.384us 1
4381
+ aten::empty 4.60% 20.050us 4.60% 20.050us 6.683us 0.000us 0.00% 0.000us 0.000us 3
4382
+ cudaLaunchKernel 34.47% 150.324us 34.47% 150.324us 50.108us 0.000us 0.00% 0.000us 0.000us 3
4383
+ cudaDeviceSynchronize 1.07% 4.681us 1.07% 4.681us 4.681us 0.000us 0.00% 0.000us 0.000us 1
4384
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4385
+ Self CPU time total: 436.162us
4386
+ Self CUDA time total: 13.055us
4387
 
4388
 
4389
  impl wl p50(ms) ok
 
4400
  <div class="uv-install-logs" id="uv-logs-benchmark">
4401
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4402
  <div class="uv-logs-content" style="display: none;">
4403
+ Installed 15 packages in 15ms
4404
  </div>
4405
  </div>
4406
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4407
+ Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 15.31it/s]
4408
+ Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 21.41it/s]</div>
4409
  <div class="cell-artifacts">
4410
  <h4>Artifacts:</h4>
4411
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
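Note: activation.jsonl stores per-workload latency summaries derived from the raw timing samples (e.g. the p50 reported in the summary tables). One plausible way to compute such fields with the standard library — the harness's exact estimator and field names may differ:

    import statistics

    def summarize(raw_times_ms: list[float]) -> dict:
        xs = sorted(raw_times_ms)
        deciles = statistics.quantiles(xs, n=10, method="inclusive")
        q1, _, q3 = statistics.quantiles(xs, n=4, method="inclusive")
        return {
            "p10": deciles[0],
            "p50": statistics.median(xs),
            "p90": deciles[-1],
            "mean": statistics.fmean(xs),
            "iqr": q3 - q1,  # spread between quartiles, robust to outlier reps
        }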
activation/impls/torch_swiglu.html CHANGED
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
4106
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
4107
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
4108
  </span> |
4109
- Cell: nv | 0.23s
4110
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
4112
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
4113
- <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/activation/impls/torch_swiglu.md" target="_blank" class="github-btn">GitHub</a>
4114
  </div>
4115
  <div id="code-nv" class="cell-code" data-lines="2">
4116
  <div class="code-wrap">
@@ -4122,7 +4122,7 @@ Cell: nv | 0.23s
4122
  </div>
4123
  </div>
4124
  <div id="output-nv" class="cell-output">
4125
- <div class="cell-stdout"><pre class="stdout-text">Thu Oct 30 15:52:16 2025
4126
  +-----------------------------------------------------------------------------------------+
4127
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
4128
  |-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4131,7 @@ Cell: nv | 0.23s
4131
  | | | MIG M. |
4132
  |=========================================+========================+======================|
4133
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
4134
- | N/A 29C P0 86W / 350W | 0MiB / 46068MiB | 0% Default |
4135
  | | | N/A |
4136
  +-----------------------------------------+------------------------+----------------------+
4137
 
@@ -4155,11 +4155,11 @@ Cell: nv | 0.23s
4155
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4156
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
4157
  </span> |
4158
- Cell: benchmark | 6.88s
4159
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4160
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4161
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4162
- <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/activation/impls/torch_swiglu.md" target="_blank" class="github-btn">GitHub</a>
4163
  </div>
4164
  <div id="code-benchmark" class="cell-code" data-lines="28">
4165
  <div class="code-wrap">
@@ -4205,20 +4205,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
4205
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4206
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4207
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4208
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 179.327us 1411.47% 179.327us 179.327us 1
4209
- torch_eager 11.22% 210.364us 99.57% 1.867ms 1.867ms 0.000us 0.00% 15.009us 15.009us 1
4210
- aten::silu 3.37% 63.151us 82.30% 1.543ms 514.355us 6.497us 51.14% 8.801us 2.934us 3
4211
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.497us 51.14% 6.497us 2.166us 3
4212
- aten::mul 1.76% 33.030us 2.90% 54.310us 18.103us 6.208us 48.86% 6.208us 2.069us 3
4213
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 48.86% 6.208us 2.069us 3
4214
- Activity Buffer Request 76.72% 1.439ms 76.72% 1.439ms 1.439ms 2.304us 18.13% 2.304us 2.304us 1
4215
- aten::slice 2.52% 47.241us 3.15% 59.052us 9.842us 0.000us 0.00% 0.000us 0.000us 6
4216
- aten::as_strided 0.63% 11.811us 0.63% 11.811us 1.968us 0.000us 0.00% 0.000us 0.000us 6
4217
- cudaLaunchKernel 3.34% 62.690us 3.34% 62.690us 10.448us 0.000us 0.00% 0.000us 0.000us 6
4218
- cudaDeviceSynchronize 0.43% 8.120us 0.43% 8.120us 8.120us 0.000us 0.00% 0.000us 0.000us 1
4219
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4220
- Self CPU time total: 1.875ms
4221
- Self CUDA time total: 12.705us
4222
 
4223
 
4224
 
@@ -4228,20 +4228,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
4228
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4229
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4230
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4231
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.777us 1228.76% 151.777us 151.777us 1
4232
- torch_eager 6.62% 113.831us 99.66% 1.713ms 1.713ms 0.000us 0.00% 14.496us 14.496us 1
4233
- aten::silu 2.46% 42.260us 88.64% 1.523ms 507.722us 6.368us 51.55% 8.512us 2.837us 3
4234
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 51.55% 6.368us 2.123us 3
4235
- aten::mul 1.53% 26.241us 2.60% 44.713us 14.904us 5.984us 48.45% 5.984us 1.995us 3
4236
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.45% 5.984us 1.995us 3
4237
- Activity Buffer Request 84.63% 1.454ms 84.63% 1.454ms 1.454ms 2.144us 17.36% 2.144us 2.144us 1
4238
- aten::slice 1.45% 24.880us 1.80% 30.920us 5.153us 0.000us 0.00% 0.000us 0.000us 6
4239
- aten::as_strided 0.35% 6.040us 0.35% 6.040us 1.007us 0.000us 0.00% 0.000us 0.000us 6
4240
- cudaLaunchKernel 2.62% 45.062us 2.62% 45.062us 7.510us 0.000us 0.00% 0.000us 0.000us 6
4241
- cudaDeviceSynchronize 0.34% 5.800us 0.34% 5.800us 5.800us 0.000us 0.00% 0.000us 0.000us 1
4242
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4243
- Self CPU time total: 1.718ms
4244
- Self CUDA time total: 12.352us
4245
 
4246
 
4247
 
@@ -4251,20 +4251,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
4251
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4252
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4253
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4254
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.422us 1145.66% 151.422us 151.422us 1
4255
- torch_eager 6.39% 108.591us 99.69% 1.694ms 1.694ms 0.000us 0.00% 15.489us 15.489us 1
4256
- aten::silu 2.42% 41.180us 88.84% 1.509ms 503.045us 6.784us 51.33% 9.056us 3.019us 3
4257
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.33% 6.784us 2.261us 3
4258
- aten::mul 1.56% 26.573us 2.72% 46.263us 15.421us 6.433us 48.67% 6.433us 2.144us 3
4259
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.433us 48.67% 6.433us 2.144us 3
4260
- Activity Buffer Request 84.90% 1.442ms 84.90% 1.442ms 1.442ms 2.272us 17.19% 2.272us 2.272us 1
4261
- aten::slice 1.42% 24.110us 1.74% 29.570us 4.928us 0.000us 0.00% 0.000us 0.000us 6
4262
- aten::as_strided 0.32% 5.460us 0.32% 5.460us 0.910us 0.000us 0.00% 0.000us 0.000us 6
4263
- cudaLaunchKernel 2.67% 45.420us 2.67% 45.420us 7.570us 0.000us 0.00% 0.000us 0.000us 6
4264
- cudaDeviceSynchronize 0.31% 5.240us 0.31% 5.240us 5.240us 0.000us 0.00% 0.000us 0.000us 1
4265
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4266
- Self CPU time total: 1.699ms
4267
- Self CUDA time total: 13.217us
4268
 
4269
 
4270
 
@@ -4274,20 +4274,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
4274
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4275
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4276
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4277
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.159us 1197.73% 152.159us 152.159us 1
4278
- torch_eager 7.49% 109.251us 99.65% 1.454ms 1.454ms 0.000us 0.00% 14.912us 14.912us 1
4279
- aten::silu 2.87% 41.871us 86.91% 1.268ms 422.724us 6.560us 51.64% 8.768us 2.923us 3
4280
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.64% 6.560us 2.187us 3
4281
- aten::mul 1.82% 26.542us 3.09% 45.132us 15.044us 6.144us 48.36% 6.144us 2.048us 3
4282
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.144us 48.36% 6.144us 2.048us 3
4283
- Activity Buffer Request 71.19% 1.039ms 71.19% 1.039ms 1.039ms 2.208us 17.38% 2.208us 2.208us 1
4284
- aten::slice 1.75% 25.480us 2.16% 31.560us 5.260us 0.000us 0.00% 0.000us 0.000us 6
4285
- aten::as_strided 0.42% 6.080us 0.42% 6.080us 1.013us 0.000us 0.00% 0.000us 0.000us 6
4286
- cudaLaunchKernel 14.12% 206.043us 14.12% 206.043us 34.340us 0.000us 0.00% 0.000us 0.000us 6
4287
- cudaDeviceSynchronize 0.35% 5.050us 0.35% 5.050us 5.050us 0.000us 0.00% 0.000us 0.000us 1
4288
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4289
- Self CPU time total: 1.459ms
4290
- Self CUDA time total: 12.704us
4291
 
4292
 
4293
 
@@ -4297,20 +4297,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
4297
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4298
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4299
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4300
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 147.295us 1117.14% 147.295us 147.295us 1
4301
- torch_eager 5.91% 105.630us 99.72% 1.782ms 1.782ms 0.000us 0.00% 15.457us 15.457us 1
4302
- aten::silu 2.35% 41.900us 89.64% 1.602ms 533.846us 6.752us 51.21% 9.024us 3.008us 3
4303
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 51.21% 6.752us 2.251us 3
4304
- aten::mul 1.43% 25.502us 2.46% 43.882us 14.627us 6.433us 48.79% 6.433us 2.144us 3
4305
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.433us 48.79% 6.433us 2.144us 3
4306
- Activity Buffer Request 78.53% 1.403ms 78.53% 1.403ms 1.403ms 2.272us 17.23% 2.272us 2.272us 1
4307
- aten::slice 1.39% 24.781us 1.71% 30.582us 5.097us 0.000us 0.00% 0.000us 0.000us 6
4308
- aten::as_strided 0.32% 5.801us 0.32% 5.801us 0.967us 0.000us 0.00% 0.000us 0.000us 6
4309
- cudaLaunchKernel 9.80% 175.053us 9.80% 175.053us 29.176us 0.000us 0.00% 0.000us 0.000us 6
4310
- cudaDeviceSynchronize 0.28% 4.969us 0.28% 4.969us 4.969us 0.000us 0.00% 0.000us 0.000us 1
4311
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4312
- Self CPU time total: 1.787ms
4313
- Self CUDA time total: 13.185us
4314
 
4315
 
4316
 
@@ -4320,20 +4320,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
4320
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4321
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4322
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4323
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 143.964us 937.33% 143.964us 143.964us 1
4324
- torch_eager 21.41% 103.402us 98.95% 477.918us 477.918us 0.000us 0.00% 18.047us 18.047us 1
4325
- aten::silu 9.04% 43.640us 62.61% 302.394us 100.798us 7.872us 51.25% 10.560us 3.520us 3
4326
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 51.25% 7.872us 2.624us 3
4327
- aten::mul 5.13% 24.761us 8.85% 42.722us 14.241us 7.487us 48.75% 7.487us 2.496us 3
4328
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.487us 48.75% 7.487us 2.496us 3
4329
- Activity Buffer Request 22.09% 106.692us 22.09% 106.692us 106.692us 2.688us 17.50% 2.688us 2.688us 1
4330
- aten::slice 4.94% 23.880us 6.09% 29.400us 4.900us 0.000us 0.00% 0.000us 0.000us 6
4331
- aten::as_strided 1.14% 5.520us 1.14% 5.520us 0.920us 0.000us 0.00% 0.000us 0.000us 6
4332
- cudaLaunchKernel 35.20% 170.023us 35.20% 170.023us 28.337us 0.000us 0.00% 0.000us 0.000us 6
4333
- cudaDeviceSynchronize 1.05% 5.060us 1.05% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1
4334
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4335
- Self CPU time total: 482.978us
4336
- Self CUDA time total: 15.359us
4337
 
4338
 
4339
 
@@ -4343,20 +4343,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
4343
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4344
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4345
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4346
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 154.301us 1078.65% 154.301us 154.301us 1
4347
- torch_eager 5.96% 107.399us 99.74% 1.796ms 1.796ms 0.000us 0.00% 16.769us 16.769us 1
4348
- aten::silu 2.38% 42.931us 89.51% 1.612ms 537.266us 7.328us 51.23% 9.792us 3.264us 3
4349
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 51.23% 7.328us 2.443us 3
4350
- aten::mul 1.49% 26.893us 2.55% 45.883us 15.294us 6.977us 48.77% 6.977us 2.326us 3
4351
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.977us 48.77% 6.977us 2.326us 3
4352
- Activity Buffer Request 78.67% 1.417ms 78.67% 1.417ms 1.417ms 2.464us 17.22% 2.464us 2.464us 1
4353
- aten::slice 1.40% 25.140us 1.72% 31.031us 5.172us 0.000us 0.00% 0.000us 0.000us 6
4354
- aten::as_strided 0.33% 5.891us 0.33% 5.891us 0.982us 0.000us 0.00% 0.000us 0.000us 6
4355
- cudaLaunchKernel 9.51% 171.283us 9.51% 171.283us 28.547us 0.000us 0.00% 0.000us 0.000us 6
4356
- cudaDeviceSynchronize 0.26% 4.600us 0.26% 4.600us 4.600us 0.000us 0.00% 0.000us 0.000us 1
4357
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4358
- Self CPU time total: 1.801ms
4359
- Self CUDA time total: 14.305us
4360
 
4361
 
4362
 
@@ -4366,20 +4366,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
4366
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4367
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4368
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4369
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 154.686us 1002.89% 154.686us 154.686us 1
4370
- torch_eager 22.31% 107.382us 99.03% 476.668us 476.668us 0.000us 0.00% 18.080us 18.080us 1
4371
- aten::silu 9.43% 45.390us 60.13% 289.404us 96.468us 7.872us 51.04% 10.528us 3.509us 3
4372
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 51.04% 7.872us 2.624us 3
4373
- aten::mul 6.54% 31.461us 10.39% 50.022us 16.674us 7.552us 48.96% 7.552us 2.517us 3
4374
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.96% 7.552us 2.517us 3
4375
- Activity Buffer Request 19.41% 93.401us 19.41% 93.401us 93.401us 2.656us 17.22% 2.656us 2.656us 1
4376
- aten::slice 5.01% 24.090us 6.20% 29.860us 4.977us 0.000us 0.00% 0.000us 0.000us 6
4377
- aten::as_strided 1.20% 5.770us 1.20% 5.770us 0.962us 0.000us 0.00% 0.000us 0.000us 6
4378
- cudaLaunchKernel 35.15% 169.174us 35.15% 169.174us 28.196us 0.000us 0.00% 0.000us 0.000us 6
4379
- cudaDeviceSynchronize 0.97% 4.650us 0.97% 4.650us 4.650us 0.000us 0.00% 0.000us 0.000us 1
4380
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4381
- Self CPU time total: 481.318us
4382
- Self CUDA time total: 15.424us
4383
 
4384
 
4385
 
@@ -4389,20 +4389,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
4389
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4390
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4391
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4392
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.678us 692.09% 155.678us 155.678us 1
4393
- torch_eager 6.04% 109.222us 99.73% 1.805ms 1.805ms 0.000us 0.00% 26.365us 26.365us 1
4394
- aten::silu 2.28% 41.351us 89.49% 1.620ms 539.866us 11.614us 51.63% 15.485us 5.162us 3
4395
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.614us 51.63% 11.614us 3.871us 3
4396
- aten::mul 1.47% 26.681us 2.47% 44.641us 14.880us 10.880us 48.37% 10.880us 3.627us 3
4397
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.880us 48.37% 10.880us 3.627us 3
4398
- Activity Buffer Request 78.73% 1.425ms 78.73% 1.425ms 1.425ms 3.871us 17.21% 3.871us 3.871us 1
4399
- aten::slice 1.39% 25.188us 1.73% 31.390us 5.232us 0.000us 0.00% 0.000us 0.000us 6
4400
- aten::as_strided 0.34% 6.202us 0.34% 6.202us 1.034us 0.000us 0.00% 0.000us 0.000us 6
4401
- cudaLaunchKernel 9.47% 171.352us 9.47% 171.352us 28.559us 0.000us 0.00% 0.000us 0.000us 6
4402
- cudaDeviceSynchronize 0.27% 4.900us 0.27% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1
4403
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4404
- Self CPU time total: 1.810ms
4405
- Self CUDA time total: 22.494us
4406
 
4407
 
4408
  impl wl p50(ms) ok
@@ -4419,7 +4419,7 @@ torch_eager cuda_T512_D768 0.05 True
4419
  <div class="uv-install-logs" id="uv-logs-benchmark">
4420
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4421
  <div class="uv-logs-content" style="display: none;">
4422
- Installed 37 packages in 216ms
4423
  </div>
4424
  </div>
4425
  <div class="cell-artifacts">
 
4106
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
4107
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
4108
  </span> |
4109
+ Cell: nv | 0.26s
4110
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
4112
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
4113
+ <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/activation/impls/torch_swiglu.md" target="_blank" class="github-btn">GitHub</a>
4114
  </div>
4115
  <div id="code-nv" class="cell-code" data-lines="2">
4116
  <div class="code-wrap">
 
4122
  </div>
4123
  </div>
4124
  <div id="output-nv" class="cell-output">
4125
+ <div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:00:17 2025
4126
  +-----------------------------------------------------------------------------------------+
4127
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
4128
  |-----------------------------------------+------------------------+----------------------+
 
4131
  | | | MIG M. |
4132
  |=========================================+========================+======================|
4133
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
4134
+ | N/A 33C P0 108W / 350W | 0MiB / 46068MiB | 88% Default |
4135
  | | | N/A |
4136
  +-----------------------------------------+------------------------+----------------------+
4137
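Note: the nv cell is two lines of code (data-lines="2") that print the nvidia-smi report above; a likely shape, assumed since cells/nv.py itself is not shown in this diff:

    import subprocess
    print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)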
 
 
4155
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4156
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
4157
  </span> |
4158
+ Cell: benchmark | 7.02s
4159
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4160
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4161
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4162
+ <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/activation/impls/torch_swiglu.md" target="_blank" class="github-btn">GitHub</a>
4163
  </div>
4164
  <div id="code-benchmark" class="cell-code" data-lines="28">
4165
  <div class="code-wrap">
 
4205
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4206
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4207
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4208
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 188.575us 1476.70% 188.575us 188.575us 1
4209
+ torch_eager 11.13% 210.826us 99.56% 1.887ms 1.887ms 0.000us 0.00% 15.106us 15.106us 1
4210
+ aten::silu 3.37% 63.781us 82.44% 1.562ms 520.736us 6.497us 50.88% 8.833us 2.944us 3
4211
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.497us 50.88% 6.497us 2.166us 3
4212
+ aten::mul 1.86% 35.170us 2.95% 55.841us 18.614us 6.273us 49.12% 6.273us 2.091us 3
4213
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.273us 49.12% 6.273us 2.091us 3
4214
+ Activity Buffer Request 76.78% 1.455ms 76.78% 1.455ms 1.455ms 2.336us 18.29% 2.336us 2.336us 1
4215
+ aten::slice 2.45% 46.380us 3.05% 57.842us 9.640us 0.000us 0.00% 0.000us 0.000us 6
4216
+ aten::as_strided 0.60% 11.462us 0.60% 11.462us 1.910us 0.000us 0.00% 0.000us 0.000us 6
4217
+ cudaLaunchKernel 3.38% 64.112us 3.38% 64.112us 10.685us 0.000us 0.00% 0.000us 0.000us 6
4218
+ cudaDeviceSynchronize 0.44% 8.280us 0.44% 8.280us 8.280us 0.000us 0.00% 0.000us 0.000us 1
4219
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4220
+ Self CPU time total: 1.895ms
4221
+ Self CUDA time total: 12.770us
4222
 
4223
 
4224
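Note: the aten::slice / aten::silu / aten::mul rows above suggest the eager baseline splits the input into two views and runs separate SiLU and multiply kernels. A plausible reconstruction — the actual benchmark body lives in cells/benchmark.py and may differ:

    import torch
    import torch.nn.functional as F

    def torch_eager_swiglu(x: torch.Tensor) -> torch.Tensor:
        gate, up = x.chunk(2, dim=-1)  # two aten::slice views per call, no copy
        return F.silu(gate) * up       # aten::silu + aten::mul, two kernel launches

Two elementwise launches instead of one is consistent with the fused kernel's lower Self CUDA totals in the traces above (roughly 9-13us fused vs 12-22us eager across these workloads).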
 
 
4228
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4229
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4230
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4231
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.926us 1234.87% 152.926us 152.926us 1
4232
+ torch_eager 6.55% 113.093us 99.67% 1.721ms 1.721ms 0.000us 0.00% 14.560us 14.560us 1
4233
+ aten::silu 2.40% 41.391us 88.69% 1.532ms 510.609us 6.400us 51.68% 8.576us 2.859us 3
4234
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.400us 51.68% 6.400us 2.133us 3
4235
+ aten::mul 1.50% 25.830us 2.63% 45.361us 15.120us 5.984us 48.32% 5.984us 1.995us 3
4236
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.32% 5.984us 1.995us 3
4237
+ Activity Buffer Request 84.72% 1.463ms 84.72% 1.463ms 1.463ms 2.176us 17.57% 2.176us 2.176us 1
4238
+ aten::slice 1.43% 24.741us 1.80% 31.062us 5.177us 0.000us 0.00% 0.000us 0.000us 6
4239
+ aten::as_strided 0.37% 6.321us 0.37% 6.321us 1.054us 0.000us 0.00% 0.000us 0.000us 6
4240
+ cudaLaunchKernel 2.71% 46.721us 2.71% 46.721us 7.787us 0.000us 0.00% 0.000us 0.000us 6
4241
+ cudaDeviceSynchronize 0.33% 5.741us 0.33% 5.741us 5.741us 0.000us 0.00% 0.000us 0.000us 1
4242
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4243
+ Self CPU time total: 1.727ms
4244
+ Self CUDA time total: 12.384us
4245
 
4246
 
4247
 
 
4251
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4252
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4253
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4254
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.413us 1147.86% 152.413us 152.413us 1
4255
+ torch_eager 6.17% 105.134us 99.68% 1.699ms 1.699ms 0.000us 0.00% 15.581us 15.581us 1
4256
+ aten::silu 2.58% 43.990us 88.96% 1.517ms 505.533us 6.814us 51.32% 9.117us 3.039us 3
4257
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.814us 51.32% 6.814us 2.271us 3
4258
+ aten::mul 1.63% 27.711us 2.72% 46.371us 15.457us 6.464us 48.68% 6.464us 2.155us 3
4259
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.68% 6.464us 2.155us 3
4260
+ Activity Buffer Request 84.84% 1.446ms 84.84% 1.446ms 1.446ms 2.303us 17.34% 2.303us 2.303us 1
4261
+ aten::slice 1.47% 24.990us 1.83% 31.250us 5.208us 0.000us 0.00% 0.000us 0.000us 6
4262
+ aten::as_strided 0.37% 6.260us 0.37% 6.260us 1.043us 0.000us 0.00% 0.000us 0.000us 6
4263
+ cudaLaunchKernel 2.63% 44.871us 2.63% 44.871us 7.478us 0.000us 0.00% 0.000us 0.000us 6
4264
+ cudaDeviceSynchronize 0.32% 5.431us 0.32% 5.431us 5.431us 0.000us 0.00% 0.000us 0.000us 1
4265
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4266
+ Self CPU time total: 1.705ms
4267
+ Self CUDA time total: 13.278us
4268
 
4269
 
4270
 
 
4274
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4275
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4276
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4277
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.359us 1219.84% 155.359us 155.359us 1
4278
+ torch_eager 6.31% 109.593us 99.71% 1.733ms 1.733ms 0.000us 0.00% 14.944us 14.944us 1
4279
+ aten::silu 2.48% 43.021us 88.93% 1.545ms 515.160us 6.560us 51.51% 8.768us 2.923us 3
4280
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.51% 6.560us 2.187us 3
4281
+ aten::mul 1.62% 28.091us 2.66% 46.261us 15.420us 6.176us 48.49% 6.176us 2.059us 3
4282
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
4283
+ Activity Buffer Request 74.70% 1.298ms 74.70% 1.298ms 1.298ms 2.208us 17.34% 2.208us 2.208us 1
4284
+ aten::slice 1.46% 25.370us 1.82% 31.631us 5.272us 0.000us 0.00% 0.000us 0.000us 6
4285
+ aten::as_strided 0.36% 6.261us 0.36% 6.261us 1.043us 0.000us 0.00% 0.000us 0.000us 6
4286
+ cudaLaunchKernel 12.80% 222.405us 12.80% 222.405us 37.068us 0.000us 0.00% 0.000us 0.000us 6
4287
+ cudaDeviceSynchronize 0.29% 4.960us 0.29% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1
4288
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4289
+ Self CPU time total: 1.738ms
4290
+ Self CUDA time total: 12.736us
4291
 
4292
 
4293
 
 
4297
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4298
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4299
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4300
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.122us 1152.94% 153.122us 153.122us 1
4301
+ torch_eager 5.95% 108.905us 99.72% 1.827ms 1.827ms 0.000us 0.00% 15.585us 15.585us 1
4302
+ aten::silu 2.26% 41.441us 89.57% 1.641ms 546.874us 6.816us 51.32% 9.120us 3.040us 3
4303
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 51.32% 6.816us 2.272us 3
4304
+ aten::mul 1.45% 26.581us 2.47% 45.261us 15.087us 6.465us 48.68% 6.465us 2.155us 3
4305
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.465us 48.68% 6.465us 2.155us 3
4306
+ Activity Buffer Request 78.54% 1.439ms 78.54% 1.439ms 1.439ms 2.304us 17.35% 2.304us 2.304us 1
4307
+ aten::slice 1.41% 25.869us 1.74% 31.870us 5.312us 0.000us 0.00% 0.000us 0.000us 6
4308
+ aten::as_strided 0.33% 6.001us 0.33% 6.001us 1.000us 0.000us 0.00% 0.000us 0.000us 6
4309
+ cudaLaunchKernel 9.78% 179.164us 9.78% 179.164us 29.861us 0.000us 0.00% 0.000us 0.000us 6
4310
+ cudaDeviceSynchronize 0.28% 5.090us 0.28% 5.090us 5.090us 0.000us 0.00% 0.000us 0.000us 1
4311
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4312
+ Self CPU time total: 1.832ms
4313
+ Self CUDA time total: 13.281us
4314
 
4315
 
4316
 
 
4320
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4321
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4322
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4323
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 150.877us 970.08% 150.877us 150.877us 1
4324
+ torch_eager 20.61% 104.763us 99.03% 503.283us 503.283us 0.000us 0.00% 18.241us 18.241us 1
4325
+ aten::silu 8.60% 43.701us 63.19% 321.148us 107.049us 7.969us 51.24% 10.657us 3.552us 3
4326
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.969us 51.24% 7.969us 2.656us 3
4327
+ aten::mul 5.45% 27.720us 8.99% 45.690us 15.230us 7.584us 48.76% 7.584us 2.528us 3
4328
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.76% 7.584us 2.528us 3
4329
+ Activity Buffer Request 24.24% 123.213us 24.24% 123.213us 123.213us 2.688us 17.28% 2.688us 2.688us 1
4330
+ aten::slice 5.04% 25.603us 6.23% 31.682us 5.280us 0.000us 0.00% 0.000us 0.000us 6
4331
+ aten::as_strided 1.20% 6.079us 1.20% 6.079us 1.013us 0.000us 0.00% 0.000us 0.000us 6
4332
+ cudaLaunchKernel 33.88% 172.204us 33.88% 172.204us 28.701us 0.000us 0.00% 0.000us 0.000us 6
4333
+ cudaDeviceSynchronize 0.97% 4.940us 0.97% 4.940us 4.940us 0.000us 0.00% 0.000us 0.000us 1
4334
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4335
+ Self CPU time total: 508.223us
4336
+ Self CUDA time total: 15.553us
4337
 
4338
 
4339
 
 
4343
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4344
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4345
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4346
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 156.541us 1089.44% 156.541us 156.541us 1
4347
+ torch_eager 6.81% 125.673us 99.72% 1.840ms 1.840ms 0.000us 0.00% 16.866us 16.866us 1
4348
+ aten::silu 2.28% 42.101us 88.57% 1.634ms 544.654us 7.361us 51.23% 9.858us 3.286us 3
4349
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.361us 51.23% 7.361us 2.454us 3
4350
+ aten::mul 1.53% 28.200us 2.53% 46.622us 15.541us 7.008us 48.77% 7.008us 2.336us 3
4351
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.77% 7.008us 2.336us 3
4352
+ Activity Buffer Request 77.96% 1.438ms 77.96% 1.438ms 1.438ms 2.497us 17.38% 2.497us 2.497us 1
4353
+ aten::slice 1.46% 26.979us 1.81% 33.310us 5.552us 0.000us 0.00% 0.000us 0.000us 6
4354
+ aten::as_strided 0.34% 6.331us 0.34% 6.331us 1.055us 0.000us 0.00% 0.000us 0.000us 6
4355
+ cudaLaunchKernel 9.33% 172.076us 9.33% 172.076us 28.679us 0.000us 0.00% 0.000us 0.000us 6
4356
+ cudaDeviceSynchronize 0.28% 5.210us 0.28% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1
4357
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4358
+ Self CPU time total: 1.845ms
4359
+ Self CUDA time total: 14.369us
4360
 
4361
 
4362
 
 
4366
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4367
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4368
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4369
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 149.754us 962.92% 149.754us 149.754us 1
4370
+ torch_eager 21.77% 106.163us 98.85% 481.952us 481.952us 0.000us 0.00% 18.240us 18.240us 1
4371
+ aten::silu 8.65% 42.151us 61.90% 301.788us 100.596us 7.968us 51.23% 10.656us 3.552us 3
4372
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 51.23% 7.968us 2.656us 3
4373
+ aten::mul 5.09% 24.801us 8.77% 42.752us 14.251us 7.584us 48.77% 7.584us 2.528us 3
4374
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.77% 7.584us 2.528us 3
4375
+ Activity Buffer Request 21.73% 105.953us 21.73% 105.953us 105.953us 2.688us 17.28% 2.688us 2.688us 1
4376
+ aten::slice 5.14% 25.050us 6.41% 31.249us 5.208us 0.000us 0.00% 0.000us 0.000us 6
4377
+ aten::as_strided 1.27% 6.199us 1.27% 6.199us 1.033us 0.000us 0.00% 0.000us 0.000us 6
4378
+ cudaLaunchKernel 35.20% 171.635us 35.20% 171.635us 28.606us 0.000us 0.00% 0.000us 0.000us 6
4379
+ cudaDeviceSynchronize 1.15% 5.600us 1.15% 5.600us 5.600us 0.000us 0.00% 0.000us 0.000us 1
4380
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4381
+ Self CPU time total: 487.552us
4382
+ Self CUDA time total: 15.552us
4383
 
4384
 
4385
 
 
4389
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4390
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4391
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4392
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 187.357us 834.00% 187.357us 187.357us 1
4393
+ torch_eager 6.93% 128.860us 99.74% 1.856ms 1.856ms 0.000us 0.00% 26.369us 26.369us 1
4394
+ aten::silu 2.32% 43.123us 88.23% 1.642ms 547.175us 11.616us 51.71% 15.520us 5.173us 3
4395
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.616us 51.71% 11.616us 3.872us 3
4396
+ aten::mul 1.63% 30.312us 2.74% 50.922us 16.974us 10.849us 48.29% 10.849us 3.616us 3
4397
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.849us 48.29% 10.849us 3.616us 3
4398
+ Activity Buffer Request 77.79% 1.447ms 77.79% 1.447ms 1.447ms 3.904us 17.38% 3.904us 3.904us 1
4399
+ aten::slice 1.49% 27.691us 1.84% 34.251us 5.708us 0.000us 0.00% 0.000us 0.000us 6
4400
+ aten::as_strided 0.35% 6.560us 0.35% 6.560us 1.093us 0.000us 0.00% 0.000us 0.000us 6
4401
+ cudaLaunchKernel 9.23% 171.734us 9.23% 171.734us 28.622us 0.000us 0.00% 0.000us 0.000us 6
4402
+ cudaDeviceSynchronize 0.26% 4.930us 0.26% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
4403
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4404
+ Self CPU time total: 1.860ms
4405
+ Self CUDA time total: 22.465us
4406
 
4407
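Note: these tables are torch.profiler key_averages output; the "Activity Buffer Request" row appears to be profiler-internal buffer allocation rather than kernel work, which is why it dominates Self CPU time in several traces. A minimal sketch of how such a table is produced (shaped for the cuda_T512_D2048 workload; harness details may differ):

    import torch
    from torch.profiler import profile, record_function, ProfilerActivity

    x = torch.randn(512, 2 * 2048, device="cuda", dtype=torch.bfloat16)

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        with record_function("torch_eager"):   # the labeled rows in the table
            for _ in range(3):                 # matches "# of Calls" = 3
                y = torch.nn.functional.silu(x[..., :2048]) * x[..., 2048:]
        torch.cuda.synchronize()               # the cudaDeviceSynchronize row

    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))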
 
4408
  impl wl p50(ms) ok
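Note: the ok column presumably reflects a numerical check of each implementation against a reference; a typical tolerance check, with the reference and tolerances here being assumptions:

    import torch

    def swiglu_ok(out: torch.Tensor, ref: torch.Tensor) -> bool:
        # hypothetical tolerances; compare the bf16 result against a higher-precision reference
        return torch.allclose(out.float(), ref.float(), rtol=2e-2, atol=2e-2)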
 
4419
  <div class="uv-install-logs" id="uv-logs-benchmark">
4420
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4421
  <div class="uv-logs-content" style="display: none;">
4422
+ Installed 37 packages in 251ms
4423
  </div>
4424
  </div>
4425
  <div class="cell-artifacts">
activation/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 49127439c8b28e18efed1525d57e9bb48bdb632034f2f84a60940f7d447aff24
  • Pointer size: 130 Bytes
  • Size of remote file: 20.6 kB

Git LFS Details

  • SHA256: 085b4a64bddea2955d6d074836121ec2e120fb1ca9140f3ccb75e8358e4526b3
  • Pointer size: 130 Bytes
  • Size of remote file: 20.6 kB
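Note: latency.svg is the combined p50-latency figure regenerated by the combine cell further down (hence the new LFS pointer). A minimal sketch of that kind of figure, assuming one p50 series per implementation over the workloads (names hypothetical):

    import matplotlib.pyplot as plt

    def plot_latency(workloads: list[str], series: dict[str, list[float]], out: str = "latency.svg") -> None:
        fig, ax = plt.subplots(figsize=(9, 5))
        for impl, p50s in series.items():
            ax.plot(workloads, p50s, marker="o", label=impl)  # one line per implementation
        ax.set_xlabel("workload")
        ax.set_ylabel("p50 latency (ms)")
        ax.grid(axis="y", alpha=0.3)  # the faint horizontal gridlines visible in the SVG diff
        ax.legend()
        fig.savefig(out)              # SVG output, tracked via Git LFS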
activation/results/combined_results.html CHANGED
@@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content {
4107
  <rdf:RDF>
4108
  <ns2:Work>
4109
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4110
- <dc:date>2025-10-30T15:53:40.869549</dc:date>
4111
  <dc:format>image/svg+xml</dc:format>
4112
  <dc:creator>
4113
  <ns2:Agent>
@@ -4256,83 +4256,83 @@ body[data-tool="eraser"] .main-content {
4256
  <g id="matplotlib.axis_2">
4257
  <g id="ytick_1">
4258
  <g id="grid-y--2" class="grid grid-y">
4259
- <path d="M 60.23 430.151687 L 847.294169 430.151687 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4260
  </g>
4261
  <g id="line2d_10">
4262
  <defs>
4263
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4264
  </defs>
4265
  <g>
4266
- <use ns4:href="#m0fca2865ba" x="60.23" y="430.151687" style="stroke: #000000; stroke-width: 0.8" />
4267
  </g>
4268
  </g>
4269
  <g id="text_10">
4270
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="433.950906" transform="rotate(-0 53.23 433.950906)">0.025</text>
4271
  </g>
4272
  </g>
4273
  <g id="ytick_2">
4274
  <g id="grid-y--3" class="grid grid-y">
4275
- <path d="M 60.23 360.098012 L 847.294169 360.098012 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4276
  </g>
4277
  <g id="line2d_11">
4278
  <g>
4279
- <use ns4:href="#m0fca2865ba" x="60.23" y="360.098012" style="stroke: #000000; stroke-width: 0.8" />
4280
  </g>
4281
  </g>
4282
  <g id="text_11">
4283
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="363.897231" transform="rotate(-0 53.23 363.897231)">0.030</text>
4284
  </g>
4285
  </g>
4286
  <g id="ytick_3">
4287
  <g id="grid-y--4" class="grid grid-y">
4288
- <path d="M 60.23 290.044337 L 847.294169 290.044337 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4289
  </g>
4290
  <g id="line2d_12">
4291
  <g>
4292
- <use ns4:href="#m0fca2865ba" x="60.23" y="290.044337" style="stroke: #000000; stroke-width: 0.8" />
4293
  </g>
4294
  </g>
4295
  <g id="text_12">
4296
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="293.843555" transform="rotate(-0 53.23 293.843555)">0.035</text>
4297
  </g>
4298
  </g>
4299
  <g id="ytick_4">
4300
  <g id="grid-y--5" class="grid grid-y">
4301
- <path d="M 60.23 219.990661 L 847.294169 219.990661 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4302
  </g>
4303
  <g id="line2d_13">
4304
  <g>
4305
- <use ns4:href="#m0fca2865ba" x="60.23" y="219.990661" style="stroke: #000000; stroke-width: 0.8" />
4306
  </g>
4307
  </g>
4308
  <g id="text_13">
4309
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="223.78988" transform="rotate(-0 53.23 223.78988)">0.040</text>
4310
  </g>
4311
  </g>
4312
  <g id="ytick_5">
4313
  <g id="grid-y--6" class="grid grid-y">
4314
- <path d="M 60.23 149.936986 L 847.294169 149.936986 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4315
  </g>
4316
  <g id="line2d_14">
4317
  <g>
4318
- <use ns4:href="#m0fca2865ba" x="60.23" y="149.936986" style="stroke: #000000; stroke-width: 0.8" />
4319
  </g>
4320
  </g>
4321
  <g id="text_14">
4322
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="153.736205" transform="rotate(-0 53.23 153.736205)">0.045</text>
4323
  </g>
4324
  </g>
4325
  <g id="ytick_6">
4326
  <g id="grid-y--7" class="grid grid-y">
4327
- <path d="M 60.23 79.883311 L 847.294169 79.883311 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4328
  </g>
4329
  <g id="line2d_15">
4330
  <g>
4331
- <use ns4:href="#m0fca2865ba" x="60.23" y="79.883311" style="stroke: #000000; stroke-width: 0.8" />
4332
  </g>
4333
  </g>
4334
  <g id="text_15">
4335
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="83.682529" transform="rotate(-0 53.23 83.682529)">0.050</text>
4336
  </g>
4337
  </g>
4338
  <g id="label--y" class="ylabel">
@@ -4340,37 +4340,37 @@ body[data-tool="eraser"] .main-content {
4340
  </g>
4341
  </g>
4342
  <g id="series--hf-kernels-swiglu" class="series">
4343
- <path d="M 96.005644 451.16779 L 185.444754 372.273341 L 274.883864 372.427459 L 364.322974 385.723647 L 453.762084 379.572934 L 543.201194 400.294811 L 632.640304 395.405064 L 722.079415 397.492664 L 811.518525 390.066975 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4344
  <defs>
4345
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4346
  </defs>
4347
  <g clip-path="url(#p620c7d392f)">
4348
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4349
- <use ns4:href="#md7efaf3aec" x="185.444754" y="372.273341" style="fill: #1f77b4; stroke: #1f77b4" />
4350
- <use ns4:href="#md7efaf3aec" x="274.883864" y="372.427459" style="fill: #1f77b4; stroke: #1f77b4" />
4351
- <use ns4:href="#md7efaf3aec" x="364.322974" y="385.723647" style="fill: #1f77b4; stroke: #1f77b4" />
4352
- <use ns4:href="#md7efaf3aec" x="453.762084" y="379.572934" style="fill: #1f77b4; stroke: #1f77b4" />
4353
- <use ns4:href="#md7efaf3aec" x="543.201194" y="400.294811" style="fill: #1f77b4; stroke: #1f77b4" />
4354
- <use ns4:href="#md7efaf3aec" x="632.640304" y="395.405064" style="fill: #1f77b4; stroke: #1f77b4" />
4355
- <use ns4:href="#md7efaf3aec" x="722.079415" y="397.492664" style="fill: #1f77b4; stroke: #1f77b4" />
4356
- <use ns4:href="#md7efaf3aec" x="811.518525" y="390.066975" style="fill: #1f77b4; stroke: #1f77b4" />
4357
  </g>
4358
  </g>
4359
  <g id="series--torch-eager" class="series">
4360
- <path d="M 96.005644 191.815073 L 185.444754 47.08418 L 274.883864 72.023288 L 364.322974 89.116386 L 453.762084 69.781571 L 543.201194 83.10578 L 632.640304 73.424362 L 722.079415 89.116385 L 811.518525 103.687549 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4361
  <defs>
4362
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4363
  </defs>
4364
  <g clip-path="url(#p620c7d392f)">
4365
- <use ns4:href="#m9b8c54d372" x="96.005644" y="191.815073" style="fill: #ff7f0e; stroke: #ff7f0e" />
4366
  <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4367
- <use ns4:href="#m9b8c54d372" x="274.883864" y="72.023288" style="fill: #ff7f0e; stroke: #ff7f0e" />
4368
- <use ns4:href="#m9b8c54d372" x="364.322974" y="89.116386" style="fill: #ff7f0e; stroke: #ff7f0e" />
4369
- <use ns4:href="#m9b8c54d372" x="453.762084" y="69.781571" style="fill: #ff7f0e; stroke: #ff7f0e" />
4370
- <use ns4:href="#m9b8c54d372" x="543.201194" y="83.10578" style="fill: #ff7f0e; stroke: #ff7f0e" />
4371
- <use ns4:href="#m9b8c54d372" x="632.640304" y="73.424362" style="fill: #ff7f0e; stroke: #ff7f0e" />
4372
- <use ns4:href="#m9b8c54d372" x="722.079415" y="89.116385" style="fill: #ff7f0e; stroke: #ff7f0e" />
4373
- <use ns4:href="#m9b8c54d372" x="811.518525" y="103.687549" style="fill: #ff7f0e; stroke: #ff7f0e" />
4374
  </g>
4375
  </g>
4376
  <g id="patch_3">
@@ -4428,7 +4428,7 @@ body[data-tool="eraser"] .main-content {
4428
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4429
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4430
  </span> |
4431
- Cell: combine | 4.28s
4432
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4433
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4434
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4554,7 +4554,7 @@ Implementations included:
4554
  <div class="uv-install-logs" id="uv-logs-combine">
4555
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4556
  <div class="uv-logs-content" style="display: none;">
4557
- Installed 37 packages in 222ms
4558
  </div>
4559
  </div>
4560
  <div class="cell-artifacts">
@@ -4567,7 +4567,7 @@ Installed 37 packages in 222ms
4567
  <rdf:RDF>
4568
  <ns2:Work>
4569
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4570
- <dc:date>2025-10-30T15:53:40.869549</dc:date>
4571
  <dc:format>image/svg+xml</dc:format>
4572
  <dc:creator>
4573
  <ns2:Agent>
@@ -4716,83 +4716,83 @@ Installed 37 packages in 222ms
4716
  <g id="matplotlib.axis_2">
4717
  <g id="ytick_1">
4718
  <g id="grid-y--2" class="grid grid-y">
4719
- <path d="M 60.23 430.151687 L 847.294169 430.151687 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4720
  </g>
4721
  <g id="line2d_10">
4722
  <defs>
4723
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4724
  </defs>
4725
  <g>
4726
- <use ns4:href="#m0fca2865ba" x="60.23" y="430.151687" style="stroke: #000000; stroke-width: 0.8" />
4727
  </g>
4728
  </g>
4729
  <g id="text_10">
4730
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="433.950906" transform="rotate(-0 53.23 433.950906)">0.025</text>
4731
  </g>
4732
  </g>
4733
  <g id="ytick_2">
4734
  <g id="grid-y--3" class="grid grid-y">
4735
- <path d="M 60.23 360.098012 L 847.294169 360.098012 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4736
  </g>
4737
  <g id="line2d_11">
4738
  <g>
4739
- <use ns4:href="#m0fca2865ba" x="60.23" y="360.098012" style="stroke: #000000; stroke-width: 0.8" />
- [SVG plot markup, old y-axis: gridline paths, tick marks, and tick labels 0.030 / 0.035 / 0.040 / 0.045 / 0.050, with gridlines at y ≈ 290.0, 220.0, 149.9, 79.9 px]
  <g id="label--y" class="ylabel">
@@ -4800,37 +4800,37 @@ Installed 37 packages in 222ms
- [SVG plot markup, old series: "hf-kernels-swiglu" path and circle markers (x ≈ 96.0–811.5 px) at y ≈ 451.2, 372.3, 372.4, 385.7, 379.6, 400.3, 395.4, 397.5, 390.1 px; "torch-eager" path and markers at y ≈ 191.8, 47.1, 72.0, 89.1, 69.8, 83.1, 73.4, 89.1, 103.7 px]
  <g id="patch_3">
 
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:date>2025-10-31T20:14:01.265668</dc:date>
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
 
+ [SVG plot markup, new y-axis: gridline paths, tick marks, and tick labels 0.025 / 0.030 / 0.035 / 0.040 / 0.045 / 0.050 at y ≈ 447.3, 372.5, 297.6, 222.8, 148.0, 73.1 px]
  <g id="label--y" class="ylabel">
 
+ [SVG plot markup, new series: "hf-kernels-swiglu" path and circle markers (x ≈ 96.0–811.5 px) at y ≈ 451.2, 376.5, 390.6, 389.2, 410.6, 412.4, 383.4, 400.3, 398.9 px; "torch-eager" path and markers at y ≈ 155.3, 47.1, 48.0, 65.2, 62.8, 92.3, 73.4, 89.1, 87.3 px]
  <g id="patch_3">
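Since the plot's y-axis is linear, the marker pixel coordinates quoted above convert back to milliseconds from any two tick anchors. A minimal sketch in Python, assuming only the new tick positions shown here (0.025 ms at y = 447.291581 px, 0.050 ms at y = 73.14009 px); y_px_to_ms is our own name, not part of the benchmark code:

def y_px_to_ms(y: float) -> float:
    # SVG y grows downward, so a smaller y pixel means a higher latency.
    y0, v0 = 447.291581, 0.025  # bottom anchor tick
    y1, v1 = 73.14009, 0.050    # top anchor tick
    return v0 + (y0 - y) * (v1 - v0) / (y0 - y1)

print(round(y_px_to_ms(451.16779), 4))   # first hf-kernels-swiglu marker, ~0.0247 ms
print(round(y_px_to_ms(155.288791), 4))  # first torch-eager marker, ~0.0445 ms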
 
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
+ Cell: combine | 4.32s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
  <div class="uv-install-logs" id="uv-logs-combine">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
+ Installed 37 packages in 213ms
  </div>
  </div>
  <div class="cell-artifacts">
 
+ [repeat of the SVG changes above, verbatim, for the second occurrence of the same plot markup in this page: the <dc:date> update to 2025-10-31T20:14:01.265668, the new y-axis ticks 0.025–0.050, and the updated "hf-kernels-swiglu" / "torch-eager" series paths and markers]
causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl CHANGED
@@ -1,24 +1,24 @@
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04394999996293336, "p50": 0.04566100000147344, "p90": 0.046750000024076144, "mean": 0.04579239999884521, "iqr": 0.0020500000346146408, "raw_times": [0.0446999999894615, 0.047901000016281614, 0.046750000024076144, 0.04566100000147344, 0.04394999996293336], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05609099997627709, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05193099997313766, "p50": 0.05449100001442275, "p90": 0.054510999973444996, "mean": 0.05559319998837964, "iqr": 0.0010200000133409048, "raw_times": [0.05349099996010409, 0.05449100001442275, 0.06354200002078869, 0.05193099997313766, 0.054510999973444996], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.060221000012461445, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051560999963840004, "p50": 0.05184100001542902, "p90": 0.05310099999178419, "mean": 0.05230499999697713, "iqr": 0.0014099999816608033, "raw_times": [0.05184100001542902, 0.05333100000370905, 0.05310099999178419, 0.05169100001012339, 0.051560999963840004], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058330999991085264, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05121099997040801, "p50": 0.051831000007496186, "p90": 0.052310999990368146, "mean": 0.05185479999454401, "iqr": 0.0008799999591246888, "raw_times": [0.05121099997040801, 0.051831000007496186, 0.052310999990368146, 0.05248999997320425, 0.05143100003124346], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05627100000538121, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050751000003401714, "p50": 0.051640999970459234, "p90": 0.05217000000357075, "mean": 0.05161080000561924, "iqr": 0.0008689999617672584, "raw_times": [0.05219100000886101, 0.05217000000357075, 0.050751000003401714, 0.05130100004180349, 0.051640999970459234], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055421000013211597, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04883100001507046, "p50": 0.049950999994052836, "p90": 0.05039000001261229, "mean": 0.04992260001017712, "iqr": 0.0006600000119760807, "raw_times": [0.04883100001507046, 0.05071100002851381, 0.04973000000063621, 0.05039000001261229, 0.049950999994052836], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05684100000280523, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04895099999657759, "p50": 0.050181000005977694, "p90": 0.05176100000880979, "mean": 0.05066500000339147, "iqr": 0.0021600000081889448, "raw_times": [0.04960100000062084, 0.05176100000880979, 0.050181000005977694, 0.05283100000497143, 0.04895099999657759], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05629100002124687, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048800999991271965, "p50": 0.051240999994206504, "p90": 0.0513809999915793, "mean": 0.05085500000632237, "iqr": 0.00043999995114063495, "raw_times": [0.051240999994206504, 0.048800999991271965, 0.051911000014115416, 0.050941000040438666, 0.0513809999915793], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056131000008008414, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04940100001249448, "p50": 0.05085099996904319, "p90": 0.05221100002472667, "mean": 0.05112659999895186, "iqr": 0.0015410000742122065, "raw_times": [0.050669999950514466, 0.05221100002472667, 0.04940100001249448, 0.0525000000379805, 0.05085099996904319], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053861000026245165, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04947999997284569, "p50": 0.05073100004437947, "p90": 0.05098100001532657, "mean": 0.05063280001422754, "iqr": 0.0010900000120273035, "raw_times": [0.04947999997284569, 0.05098100001532657, 0.04989100000329927, 0.05073100004437947, 0.052081000035286706], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054841000007854745, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05039100000203689, "p50": 0.051160999987587275, "p90": 0.05154000001539316, "mean": 0.051364599994485616, "iqr": 0.00038000001723048626, "raw_times": [0.051160999987587275, 0.05257099996924808, 0.05039100000203689, 0.05154000001539316, 0.051159999998162675], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05513099995368975, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048071000037452905, "p50": 0.05178100002467545, "p90": 0.0526809999996658, "mean": 0.05150900001353875, "iqr": 0.0032599999713056604, "raw_times": [0.04942100002836014, 0.0526809999996658, 0.05178100002467545, 0.05559099997753947, 0.048071000037452905], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05527100000790597, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04963099996757592, "p50": 0.05203099999562255, "p90": 0.052549999963957816, "mean": 0.05276679999042244, "iqr": 0.0005189999683352653, "raw_times": [0.05759100002933337, 0.05203099999562255, 0.052549999963957816, 0.04963099996757592, 0.05203099999562255], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07661199998665325, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049701000023105735, "p50": 0.051581000036549085, "p90": 0.05290100000365783, "mean": 0.05255880001868718, "iqr": 0.002381000001605571, "raw_times": [0.05290100000365783, 0.058091000028070994, 0.051581000036549085, 0.05052000000205226, 0.049701000023105735], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054920000025049376, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0517009999612128, "p50": 0.05219999997052582, "p90": 0.05233100000623381, "mean": 0.05215079999061345, "iqr": 0.0001500000053056283, "raw_times": [0.05233100000623381, 0.05234100001416664, 0.05219999997052582, 0.0517009999612128, 0.05218100000092818], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055141000018466, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05047100000865612, "p50": 0.05349100001694751, "p90": 0.05691100000149163, "mean": 0.057148999997025385, "iqr": 0.004350000040176383, "raw_times": [0.05047100000865612, 0.05349100001694751, 0.07231099999671642, 0.05256099996131525, 0.05691100000149163], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05554099999471873, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049690000025748304, "p50": 0.050921000024573004, "p90": 0.051730999985011294, "mean": 0.051232800001344, "iqr": 0.0010800000040944724, "raw_times": [0.05065099998091682, 0.051730999985011294, 0.05317099999047059, 0.049690000025748304, 0.050921000024573004], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05373099997996178, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05013100002315696, "p50": 0.05073099998753605, "p90": 0.052470999946763186, "mean": 0.051448999988679134, "iqr": 0.001829999973779195, "raw_times": [0.05013100002315696, 0.05073099998753605, 0.05327100001295548, 0.052470999946763186, 0.05064099997298399], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05419999996547631, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04918100000850245, "p50": 0.050670999996782484, "p90": 0.05192099996520483, "mean": 0.050938799995492445, "iqr": 0.0013709999393540784, "raw_times": [0.04918100000850245, 0.05192099996520483, 0.05237099998112171, 0.05055000002585075, 0.050670999996782484], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05559099997753947, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049871000044277025, "p50": 0.05047100000865612, "p90": 0.05118100000345294, "mean": 0.050820800015571876, "iqr": 0.0007699999855503847, "raw_times": [0.049871000044277025, 0.05041100001790255, 0.05217000000357075, 0.05047100000865612, 0.05118100000345294], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05564100001720362, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05115100003649786, "p50": 0.052071000027353875, "p90": 0.05212100001017461, "mean": 0.05199700001412566, "iqr": 0.0006100000291553442, "raw_times": [0.05115100003649786, 0.052071000027353875, 0.053131000015582686, 0.05212100001017461, 0.05151099998101927], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05440099999987069, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04927099996621109, "p50": 0.051500999973086437, "p90": 0.05194099998107049, "mean": 0.05114499998626343, "iqr": 0.000919999990856013, "raw_times": [0.051500999973086437, 0.04927099996621109, 0.051991000020734646, 0.05194099998107049, 0.05102099999021448], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054591000036907644, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049930999978187174, "p50": 0.050361000035081815, "p90": 0.05102099999021448, "mean": 0.05066480000550655, "iqr": 0.0008009999987734773, "raw_times": [0.050219999991441, 0.050361000035081815, 0.05179100003260828, 0.049930999978187174, 0.05102099999021448], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05545099998016667, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0500799999940682, "p50": 0.05195099998900332, "p90": 0.051991000020734646, "mean": 0.05318280000210507, "iqr": 0.0014600000213249587, "raw_times": [0.0500799999940682, 0.05195099998900332, 0.051991000020734646, 0.05053099999940969, 0.061361000007309485], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05489099999067548, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
 
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.06906199996592477, "p50": 0.07093199997143529, "p90": 0.07169200000589626, "mean": 0.07107379998387842, "iqr": 0.0011000000199601345, "raw_times": [0.07093199997143529, 0.07309099999019963, 0.07059199998593613, 0.07169200000589626, 0.06906199996592477], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07642200000645971, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08730199999718025, "p50": 0.08879199998546028, "p90": 0.08886199998414668, "mean": 0.0890762000040013, "iqr": 0.00037899997096246807, "raw_times": [0.08730199999718025, 0.08879199998546028, 0.08848300001318421, 0.08886199998414668, 0.09194200004003505], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.091862999965997, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08465199999818651, "p50": 0.08821300002637145, "p90": 0.08871199997884105, "mean": 0.08770840000806857, "iqr": 0.0007599999776175537, "raw_times": [0.08465199999818651, 0.0879520000012235, 0.08821300002637145, 0.08901300003572032, 0.08871199997884105], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09156300001222917, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08501199999955134, "p50": 0.08710200000905388, "p90": 0.08719199996676252, "mean": 0.08665020000080403, "iqr": 0.001349999934063817, "raw_times": [0.08501199999955134, 0.08710200000905388, 0.08719199996676252, 0.0858420000326987, 0.08810299999595372], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09103200000026845, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08575200001814665, "p50": 0.08690200002092752, "p90": 0.08706200003416598, "mean": 0.08684220001669019, "iqr": 0.00029900002118665725, "raw_times": [0.08773199999723147, 0.08676300001297932, 0.08690200002092752, 0.08706200003416598, 0.08575200001814665], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09036199998035954, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08490200002597703, "p50": 0.08731200000511308, "p90": 0.0877829999694768, "mean": 0.08806820000017979, "iqr": 0.001451000002816727, "raw_times": [0.09401200003367194, 0.08731200000511308, 0.08633199996666008, 0.08490200002597703, 0.0877829999694768], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0907329999790818, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0847820000444699, "p50": 0.08513199998105847, "p90": 0.08660200001031626, "mean": 0.08566600000676772, "iqr": 0.0016600000094513234, "raw_times": [0.08494200000086494, 0.0847820000444699, 0.08687199999712902, 0.08660200001031626, 0.08513199998105847], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0911219999579771, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08356199998615921, "p50": 0.0846430000365217, "p90": 0.08576199996923606, "mean": 0.08508039999242101, "iqr": 0.0011189999895577785, "raw_times": [0.08356199998615921, 0.0867919999905098, 0.08464299997967828, 0.08576199996923606, 0.0846430000365217], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08955300000934585, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08469199997307442, "p50": 0.08614199998646654, "p90": 0.08723299998791845, "mean": 0.08654439999418173, "iqr": 0.0011309999763398082, "raw_times": [0.08469199997307442, 0.08610200001157864, 0.08614199998646654, 0.08855300001187061, 0.08723299998791845], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09115300002804361, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08576300001550408, "p50": 0.08703200001036748, "p90": 0.08823299998539369, "mean": 0.09075460000076419, "iqr": 0.0015310000094359566, "raw_times": [0.10604300001659794, 0.08823299998539369, 0.08703200001036748, 0.08670199997595773, 0.08576300001550408], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08985199997368909, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.14525299997103502, "p50": 0.1457439999512644, "p90": 0.1459139999724357, "mean": 0.1457395999750588, "iqr": 0.00044099999740865314, "raw_times": [0.14525299997103502, 0.14547299997502705, 0.1457439999512644, 0.14631400000553185, 0.1459139999724357], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1472430000148961, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16037399996093882, "p50": 0.16231400002197915, "p90": 0.16309400001546237, "mean": 0.1622881999992387, "iqr": 0.0012190000120426703, "raw_times": [0.16309400001546237, 0.16231400002197915, 0.16378399999439353, 0.1618750000034197, 0.16037399996093882], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.16341399998509587, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08445299999948475, "p50": 0.08518200002072263, "p90": 0.08666200000106983, "mean": 0.08572240001285536, "iqr": 0.0017899999988912896, "raw_times": [0.08445299999948475, 0.08744300004082106, 0.08518200002072263, 0.08666200000106983, 0.08487200000217854], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0890119999894523, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08437200000344092, "p50": 0.08463200003916427, "p90": 0.08609200000364581, "mean": 0.08522400000856578, "iqr": 0.0015900000107649248, "raw_times": [0.08463200003916427, 0.08609200000364581, 0.08652200000369703, 0.08437200000344092, 0.08450199999288088], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08977199996706986, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08375199996635274, "p50": 0.08519199997181204, "p90": 0.08627200003274993, "mean": 0.08607399998936671, "iqr": 0.0020100000597267353, "raw_times": [0.08375199996635274, 0.0842619999730232, 0.08627200003274993, 0.08519199997181204, 0.09089200000289566], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08821199998010343, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08470200003785067, "p50": 0.08566200000359458, "p90": 0.08573299999170558, "mean": 0.08566220001284819, "iqr": 0.0006109999617365247, "raw_times": [0.08470200003785067, 0.08709200000112105, 0.08512200002996906, 0.08566200000359458, 0.08573299999170558], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08864200003699807, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08451200000081371, "p50": 0.08525300000883362, "p90": 0.08580199994412396, "mean": 0.08525219999455658, "iqr": 0.0009299999419454252, "raw_times": [0.08580199994412396, 0.08525300000883362, 0.08451200000081371, 0.08487200000217854, 0.08582200001683304], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08942300001990588, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08533199996918484, "p50": 0.08693199998788259, "p90": 0.09015199998430035, "mean": 0.08883799998784525, "iqr": 0.0043200000163778896, "raw_times": [0.08533199996918484, 0.09015199998430035, 0.08583199996792246, 0.08693199998788259, 0.09594200002993603], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09176200001093093, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08384200003774822, "p50": 0.08611200001951147, "p90": 0.08663199997727133, "mean": 0.08570400000280642, "iqr": 0.001730000008137722, "raw_times": [0.08384200003774822, 0.08611200001951147, 0.08703200001036748, 0.08663199997727133, 0.08490199996913361], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08941200002254845, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08507300003657292, "p50": 0.0865819999944506, "p90": 0.08741199997075455, "mean": 0.09195439998848087, "iqr": 0.0020300000187489786, "raw_times": [0.11532299998862072, 0.0865819999944506, 0.08741199997075455, 0.08538199995200557, 0.08507300003657292], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08733200002097874, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09419299999535724, "p50": 0.09539199999153425, "p90": 0.09730299996135727, "mean": 0.09678459998667677, "iqr": 0.002380999944762152, "raw_times": [0.10211299996853995, 0.09730299996135727, 0.09492200001659512, 0.09539199999153425, 0.09419299999535724], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09651299995994123, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.10080199996309602, "p50": 0.10192199999892182, "p90": 0.1026219999857858, "mean": 0.10294419998899684, "iqr": 0.0008999999749903509, "raw_times": [0.10765299998638511, 0.10172200001079545, 0.1026219999857858, 0.10192199999892182, 0.10080199996309602], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10299199999508346, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4861929999719905, "p50": 0.4890019999947981, "p90": 0.48961200002395344, "mean": 0.48862639999924795, "iqr": 0.001079000014669873, "raw_times": [0.48979199999621414, 0.4861929999719905, 0.48961200002395344, 0.4890019999947981, 0.48853300000928357], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.48705300002893637, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.49736299996538946, "p50": 0.49848299994437184, "p90": 0.49918199999865465, "mean": 0.4987367999774506, "iqr": 0.0007590000450363732, "raw_times": [0.4984229999536183, 0.49848299994437184, 0.49918199999865465, 0.5002330000252186, 0.49736299996538946], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4985730000157673, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
causal_conv1d/impls/cells/benchmark.py CHANGED
@@ -4,28 +4,37 @@
  # "numpy",
  # "torch==2.8.0",
  # "kernels-benchmark-tools",
- # "kernels",
  # ]
  #
  # [tool.uv.sources]
  # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
  # ///
  import torch
+ import torch.nn.functional as F
  import sys
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
- from kernels import get_kernel
-
- # Load the causal conv1d kernel
- causal_conv1d = get_kernel("kernels-community/causal-conv1d")
-
-
- def hf_kernels_causal_conv1d(input_tensor, weight, bias):
-     return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias)
+
+
+ def torch_causal_conv1d(input_tensor, weight, bias):
+     # Convert to weight dtype for computation
+     x = input_tensor.to(weight.dtype)
+     dim = weight.shape[0]
+     width = weight.shape[1]
+     seqlen = input_tensor.shape[-1]
+
+     # Depthwise causal conv1d using PyTorch
+     out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
+
+     # Truncate to original sequence length
+     out = out[..., :seqlen]
+
+     # Convert back to original dtype
+     return out.to(input_tensor.dtype)


  run_benchmark(
      kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
-     impl_name="hf_kernels_causal_conv1d",
-     impl_tags={"family": "hf-kernels", "backend": "cuda"},
-     impl_func=hf_kernels_causal_conv1d,
+     impl_name="torch_eager",
+     impl_tags={"family": "pytorch", "backend": "eager"},
+     impl_func=torch_causal_conv1d,
  )
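Note on the replacement implementation: F.conv1d with padding=width-1 pads both ends, yielding seqlen+width-1 outputs; keeping only the first seqlen means output position t depends solely on inputs at positions <= t, which is the causal property the CUDA kernel provides. A standalone sanity check of that claim; shapes and names here are illustrative, not part of the benchmark:

import torch
import torch.nn.functional as F

batch, dim, seqlen, width = 2, 8, 32, 4
x = torch.randn(batch, dim, seqlen)
w = torch.randn(dim, width)   # one filter per channel (depthwise)
b = torch.randn(dim)

def causal(inp):
    # Depthwise conv with symmetric padding, then truncate to seqlen.
    out = F.conv1d(inp, w.unsqueeze(1), b, padding=width - 1, groups=dim)
    return out[..., :seqlen]

# Zeroing the "future" (positions > t) must not change outputs at positions <= t.
t = 10
x_future_zeroed = x.clone()
x_future_zeroed[..., t + 1:] = 0
assert torch.allclose(causal(x)[..., :t + 1], causal(x_future_zeroed)[..., :t + 1])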
causal_conv1d/impls/hf_kernels_causal_conv1d.html CHANGED
The diff for this file is too large to render. See raw diff
 
causal_conv1d/impls/torch_causal_conv1d.html CHANGED
The diff for this file is too large to render. See raw diff
 
causal_conv1d/results/artifacts/combine/latency.svg CHANGED

Git LFS Details (before)

  • SHA256: cf8858bb054bd7e8f82af77fd05a6475b7ee3a9a335ba4a6506cd1c694804777
  • Pointer size: 130 Bytes
  • Size of remote file: 35.4 kB

Git LFS Details (after)

  • SHA256: 6fdf61512b0add92f3d8e4a284ecb814f7a3b11b2db0fe3af610896a05d7072f
  • Pointer size: 130 Bytes
  • Size of remote file: 35.4 kB
causal_conv1d/results/combined_results.html CHANGED
@@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content {
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:date>2025-10-30T15:53:58.349427</dc:date>
  <dc:format>image/svg+xml</dc:format>
@@ -4451,70 +4451,70 @@ body[data-tool="eraser"] .main-content {
- [SVG axis markup: y-axis gridlines, tick marks, and labels 0.1–0.5 (latency, ms); positions recomputed in this commit]
@@ -4522,66 +4522,66 @@ body[data-tool="eraser"] .main-content {
- [SVG series markup: latency line paths and point markers for "hf-kernels-causal-conv1d" and "torch-eager"; coordinates recomputed in this commit]
@@ -4640,7 +4640,7 @@ body[data-tool="eraser"] .main-content {
- Cell: combine | 4.38s
@@ -4753,28 +4753,28 @@ hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.05 True
  hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
  torch_eager cuda_B2_D2048_S128_W2 0.09 True
  torch_eager cuda_B2_D2048_S128_W4 0.08 True
- torch_eager cuda_B2_D2048_S2048_W2 0.14 True
  torch_eager cuda_B2_D2048_S2048_W4 0.16 True
  torch_eager cuda_B2_D2048_S512_W2 0.09 True
- torch_eager cuda_B2_D2048_S512_W4 0.08 True
  torch_eager cuda_B2_D64_S128_W2 0.07 True
  torch_eager cuda_B2_D64_S128_W4 0.09 True
  torch_eager cuda_B2_D64_S2048_W2 0.09 True
- torch_eager cuda_B2_D64_S2048_W4 0.08 True
  torch_eager cuda_B2_D64_S512_W2 0.09 True
  torch_eager cuda_B2_D64_S512_W4 0.09 True
- torch_eager cuda_B4_D2048_S128_W2 0.08 True
- torch_eager cuda_B4_D2048_S128_W4 0.08 True
  torch_eager cuda_B4_D2048_S2048_W2 0.49 True
  torch_eager cuda_B4_D2048_S2048_W4 0.50 True
- torch_eager cuda_B4_D2048_S512_W2 0.09 True
  torch_eager cuda_B4_D2048_S512_W4 0.10 True
- torch_eager cuda_B4_D64_S128_W2 0.08 True
  torch_eager cuda_B4_D64_S128_W4 0.08 True
- torch_eager cuda_B4_D64_S2048_W2 0.08 True
- torch_eager cuda_B4_D64_S2048_W4 0.08 True
- torch_eager cuda_B4_D64_S512_W2 0.08 True
- torch_eager cuda_B4_D64_S512_W4 0.08 True

GENERATING COMBINED VISUALIZATION
@@ -4794,7 +4794,7 @@ Implementations included:
- Installed 37 packages in 211ms
@@ -4807,7 +4807,7 @@ Installed 37 packages in 211ms
- <dc:date>2025-10-30T15:53:58.349427</dc:date>
@@ -5151,70 +5151,70 @@ Installed 37 packages in 211ms
- [second embedded copy of the SVG axis markup: same gridlines, tick marks, and 0.1–0.5 labels]
@@ -5222,66 +5222,66 @@ Installed 37 packages in 211ms
- [second embedded copy of the SVG series markup for "hf-kernels-causal-conv1d" and "torch-eager"]
 
+ <dc:date>2025-10-31T20:14:05.716143</dc:date>
 
+ [updated SVG markup for the same plot: recomputed y-axis gridline and tick-label positions (0.1–0.5) and updated latency series coordinates for "hf-kernels-causal-conv1d" and "torch-eager"]
 
+ Cell: combine | 4.43s
 
  hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
  torch_eager cuda_B2_D2048_S128_W2 0.09 True
  torch_eager cuda_B2_D2048_S128_W4 0.08 True
+ torch_eager cuda_B2_D2048_S2048_W2 0.15 True
  torch_eager cuda_B2_D2048_S2048_W4 0.16 True
  torch_eager cuda_B2_D2048_S512_W2 0.09 True
+ torch_eager cuda_B2_D2048_S512_W4 0.09 True
  torch_eager cuda_B2_D64_S128_W2 0.07 True
  torch_eager cuda_B2_D64_S128_W4 0.09 True
  torch_eager cuda_B2_D64_S2048_W2 0.09 True
+ torch_eager cuda_B2_D64_S2048_W4 0.09 True
  torch_eager cuda_B2_D64_S512_W2 0.09 True
  torch_eager cuda_B2_D64_S512_W4 0.09 True
+ torch_eager cuda_B4_D2048_S128_W2 0.09 True
+ torch_eager cuda_B4_D2048_S128_W4 0.09 True
  torch_eager cuda_B4_D2048_S2048_W2 0.49 True
  torch_eager cuda_B4_D2048_S2048_W4 0.50 True
+ torch_eager cuda_B4_D2048_S512_W2 0.10 True
  torch_eager cuda_B4_D2048_S512_W4 0.10 True
+ torch_eager cuda_B4_D64_S128_W2 0.09 True
  torch_eager cuda_B4_D64_S128_W4 0.08 True
+ torch_eager cuda_B4_D64_S2048_W2 0.09 True
+ torch_eager cuda_B4_D64_S2048_W4 0.09 True
+ torch_eager cuda_B4_D64_S512_W2 0.09 True
+ torch_eager cuda_B4_D64_S512_W4 0.09 True

GENERATING COMBINED VISUALIZATION
 
 
4794
  <div class="uv-install-logs" id="uv-logs-combine">
4795
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4796
  <div class="uv-logs-content" style="display: none;">
4797
+ Installed 37 packages in 238ms
4798
  </div>
4799
  </div>
4800
  <div class="cell-artifacts">
 
4807
  <rdf:RDF>
4808
  <ns2:Work>
4809
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4810
+ <dc:date>2025-10-31T20:14:05.716143</dc:date>
4811
  <dc:format>image/svg+xml</dc:format>
4812
  <dc:creator>
4813
  <ns2:Agent>
 
  [latency.svg markup omitted: updated y-axis ticks (0.1-0.5) with gridlines, the y-axis label, and redrawn line/marker paths for the hf-kernels-causal-conv1d (blue) and torch-eager (orange) series; SVG plot geometry only, no semantic change]
deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl ADDED
@@ -0,0 +1,4 @@
1
+ {"ts": "2025-10-31T20:13:50Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q100_H8_E256_L4_P4", "batch_size": 1, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.3733269999629556, "p50": 3.3932979999917734, "p90": 3.4002180000243243, "mean": 3.393551400040451, "iqr": 0.010580999969533877, "raw_times": [3.3896370000547904, 3.4002180000243243, 3.3932979999917734, 3.3733269999629556, 3.411277000168411], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.4049870000671945, "peak_bytes": 5929472, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
2
+ {"ts": "2025-10-31T20:13:51Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q300_H8_E256_L4_P4", "batch_size": 1, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.99112300010529, "p50": 4.007804000139004, "p90": 4.020502999992459, "mean": 4.014501400024528, "iqr": 0.017490000118414173, "raw_times": [4.050064000011844, 4.020502999992459, 4.007804000139004, 4.003012999874045, 3.99112300010529], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.017783999870517, "peak_bytes": 15161856, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
3
+ {"ts": "2025-10-31T20:13:51Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q100_H8_E256_L4_P4", "batch_size": 2, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.004662999932407, "p50": 4.020202999981848, "p90": 4.030714000009539, "mean": 4.022331200030749, "iqr": 0.011850999953821884, "raw_times": [4.018863000055717, 4.004662999932407, 4.0372130001742335, 4.020202999981848, 4.030714000009539], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.032904000041526, "peak_bytes": 11958784, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
4
+ {"ts": "2025-10-31T20:13:52Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q300_H8_E256_L4_P4", "batch_size": 2, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.005022999990615, "p50": 4.020072999992408, "p90": 4.0240040000298904, "mean": 4.01746140000796, "iqr": 0.009850999958871398, "raw_times": [4.014153000071019, 4.005022999990615, 4.024053999955868, 4.0240040000298904, 4.020072999992408], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.024974000003567, "peak_bytes": 30977024, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
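Each record stores the raw latency samples alongside derived summary statistics. A quick sketch of recomputing the p10/p50/p90/mean/iqr fields from "raw_times" (method="lower" reproduces the values in these records, though the harness's exact percentile scheme is not shown in this diff and is an assumption here):

    # Sketch: recompute a record's summary stats from its raw_times samples.
    # Assumption: the harness uses "lower" percentile interpolation.
    import json
    import numpy as np

    rec = json.loads(open("deformable_detr.jsonl").readline())
    t = np.array(rec["lat_ms"]["raw_times"])
    p10, p25, p50, p75, p90 = np.percentile(t, [10, 25, 50, 75, 90], method="lower")
    print(p10, p50, p90, t.mean(), p75 - p25)  # compare to lat_ms.{p10,p50,p90,mean,iqr}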
deformable_detr/impls/cells/benchmark.py ADDED
@@ -0,0 +1,118 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #     "numpy",
+ #     "torch==2.8.0",
+ #     "kernels-benchmark-tools",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+ # ///
+ import torch
+ import sys
+ from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+ def torch_deformable_detr(
+     value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64
+ ):
+     """
+     PyTorch native reference implementation of multi-scale deformable attention.
+     Uses vectorized bilinear interpolation for reasonable performance.
+     """
+     bs, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
+     _, _, _, channels = value.shape
+
+     output = torch.zeros(bs, num_queries, num_heads, channels, device=value.device, dtype=value.dtype)
+
+     # Split value tensor by levels
+     value_list = value.split([int(h * w) for h, w in spatial_shapes.tolist()], dim=1)
+
+     # Iterate through each level (can't avoid this loop easily)
+     for level_idx in range(num_levels):
+         h, w = spatial_shapes[level_idx].tolist()
+         value_level = value_list[level_idx]  # (bs, h*w, num_heads, channels)
+
+         # Reshape to spatial grid: (bs, num_heads, channels, h, w)
+         value_spatial = value_level.reshape(bs, h, w, num_heads, channels).permute(0, 3, 4, 1, 2)
+
+         # Get sampling locations and weights for this level
+         # loc: (bs, num_queries, num_heads, num_points, 2)
+         loc = sampling_locations[:, :, :, level_idx, :, :]
+         # weight: (bs, num_queries, num_heads, num_points)
+         weight = attention_weights[:, :, :, level_idx, :]
+
+         # Convert normalized coordinates to pixel coordinates
+         # loc[..., 0] is x (width), loc[..., 1] is y (height)
+         x = loc[..., 0] * w - 0.5  # (bs, num_queries, num_heads, num_points)
+         y = loc[..., 1] * h - 0.5
+
+         # Get integer coordinates for bilinear interpolation
+         x0 = torch.floor(x).long()
+         y0 = torch.floor(y).long()
+         x1 = x0 + 1
+         y1 = y0 + 1
+
+         # Compute interpolation weights BEFORE clamping (important!)
+         lw = x - x0.float()  # weight for x direction
+         lh = y - y0.float()  # weight for y direction
+         hw = 1 - lw
+         hh = 1 - lh
+
+         # Create mask for valid sample locations
+         valid = (y > -1) & (x > -1) & (y < h) & (x < w)
+
+         # Create masks for each corner being in bounds
+         mask_tl = ((y0 >= 0) & (x0 >= 0)).unsqueeze(-1).float()
+         mask_tr = ((y0 >= 0) & (x1 <= w - 1)).unsqueeze(-1).float()
+         mask_bl = ((y1 <= h - 1) & (x0 >= 0)).unsqueeze(-1).float()
+         mask_br = ((y1 <= h - 1) & (x1 <= w - 1)).unsqueeze(-1).float()
+
+         # Clamp coordinates for safe indexing
+         x0_clamped = torch.clamp(x0, 0, w - 1)
+         x1_clamped = torch.clamp(x1, 0, w - 1)
+         y0_clamped = torch.clamp(y0, 0, h - 1)
+         y1_clamped = torch.clamp(y1, 0, h - 1)
+
+         # Bilinear interpolation weights for all 4 corners
+         w_tl = (hh * hw).unsqueeze(-1)  # top-left: (bs, num_queries, num_heads, num_points, 1)
+         w_tr = (hh * lw).unsqueeze(-1)  # top-right
+         w_bl = (lh * hw).unsqueeze(-1)  # bottom-left
+         w_br = (lh * lw).unsqueeze(-1)  # bottom-right
+
+         # Gather values from the 4 corners using advanced indexing
+         batch_idx = torch.arange(bs, device=value.device).view(bs, 1, 1, 1).expand(bs, num_queries, num_heads, num_points)
+         head_idx = torch.arange(num_heads, device=value.device).view(1, 1, num_heads, 1).expand(bs, num_queries, num_heads, num_points)
+
+         # Gather corner values with clamped indices, then apply corner masks
+         v_tl = value_spatial[batch_idx, head_idx, :, y0_clamped, x0_clamped] * mask_tl
+         v_tr = value_spatial[batch_idx, head_idx, :, y0_clamped, x1_clamped] * mask_tr
+         v_bl = value_spatial[batch_idx, head_idx, :, y1_clamped, x0_clamped] * mask_bl
+         v_br = value_spatial[batch_idx, head_idx, :, y1_clamped, x1_clamped] * mask_br
+
+         # Bilinear interpolation
+         sampled = w_tl * v_tl + w_tr * v_tr + w_bl * v_bl + w_br * v_br
+
+         # Apply valid mask (only accumulate if entire sample location is valid)
+         sampled = sampled * valid.unsqueeze(-1).float()
+
+         # Apply attention weights and sum over points
+         # weight: (bs, num_queries, num_heads, num_points)
+         # Expand weight: (bs, num_queries, num_heads, num_points, 1)
+         weighted_sampled = sampled * weight.unsqueeze(-1)
+
+         # Sum over points: (bs, num_queries, num_heads, channels)
+         output += weighted_sampled.sum(dim=3)
+
+     # Flatten last two dimensions to match kernel output
+     return output.reshape(bs, num_queries, num_heads * channels)
+
+
+ run_benchmark(
+     kernel_type=KernelTypeEnum.DEFORMABLE_DETR,
+     impl_name="torch_eager",
+     impl_tags={"family": "pytorch", "backend": "eager"},
+     impl_func=torch_deformable_detr,
+     dtype="float32",
+ )
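For context, a minimal standalone sketch of driving this reference directly on random inputs (shapes follow the cuda_B1_Q100_H8_E256_L4_P4 workload benchmarked above; normally run_benchmark supplies the inputs, so everything below is illustrative):

    # Hypothetical direct invocation of torch_deformable_detr (sketch).
    import torch

    bs, num_queries, num_heads, embed_dim, num_levels, num_points = 1, 100, 8, 256, 4, 4
    channels = embed_dim // num_heads
    spatial_shapes = torch.tensor([[32, 32], [16, 16], [8, 8], [4, 4]])
    areas = spatial_shapes[:, 0] * spatial_shapes[:, 1]
    level_start_index = torch.cat((areas.new_zeros(1), areas.cumsum(0)[:-1]))

    value = torch.randn(bs, int(areas.sum()), num_heads, channels)
    sampling_locations = torch.rand(bs, num_queries, num_heads, num_levels, num_points, 2)
    attention_weights = torch.softmax(
        torch.randn(bs, num_queries, num_heads, num_levels * num_points), dim=-1
    ).view(bs, num_queries, num_heads, num_levels, num_points)

    out = torch_deformable_detr(value, spatial_shapes, level_start_index,
                                sampling_locations, attention_weights)
    print(out.shape)  # torch.Size([1, 100, 256])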
deformable_detr/impls/cells/nv.py ADDED
@@ -0,0 +1,2 @@
1
+ import subprocess
2
+ print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
deformable_detr/impls/hf_kernels_deformable_detr.html ADDED
The diff for this file is too large to render. See raw diff
 
deformable_detr/impls/index.html ADDED
@@ -0,0 +1,89 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <meta charset='UTF-8'>
+     <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+     <title>Index of /deformable_detr/impls</title>
+     <style>
+         :root {
+             --bg-primary: #0a0a0a;
+             --bg-secondary: #121212;
+             --bg-tertiary: #181818;
+             --text-primary: #e0e0e0;
+             --text-secondary: #888888;
+             --text-link: #64b5f6;
+             --border-primary: #2a2a2a;
+         }
+         body {
+             font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+             background: var(--bg-primary);
+             color: var(--text-primary);
+             margin: 0;
+             padding: 16px;
+             max-width: 900px;
+             margin: 0 auto;
+         }
+         .controls {
+             display: flex;
+             justify-content: flex-end;
+             margin-bottom: 1rem;
+         }
+         .back-button {
+             background: var(--bg-secondary);
+             border: 1px solid var(--border-primary);
+             padding: 8px 12px;
+             border-radius: 4px;
+             color: var(--text-secondary);
+             cursor: pointer;
+             font-size: 0.9rem;
+             text-decoration: none;
+             display: inline-block;
+         }
+         .back-button:hover {
+             color: var(--text-primary);
+             background: var(--bg-tertiary);
+         }
+         h1 {
+             font-size: 1.5em;
+             margin: 1rem 0;
+             color: var(--text-primary);
+             border-bottom: 1px solid var(--border-primary);
+             padding-bottom: 0.5rem;
+         }
+         ul {
+             list-style-type: none;
+             padding: 0;
+         }
+         li {
+             margin: 0;
+             border-bottom: 1px solid var(--border-primary);
+         }
+         li:last-child {
+             border-bottom: none;
+         }
+         a {
+             display: block;
+             padding: 0.75rem 0.5rem;
+             text-decoration: none;
+             color: var(--text-link);
+             transition: background 0.2s ease;
+         }
+         a:hover {
+             background: var(--bg-secondary);
+         }
+         .dir {
+             font-weight: 500;
+         }
+     </style>
+ </head>
+ <body>
+     <div class='controls'>
+         <a href='../index.html' class='back-button'>← back</a>
+     </div>
+     <h1>Index of /deformable_detr/impls</h1>
+     <ul>
+         <li><a href='hf_kernels_deformable_detr.html' class='file'>hf_kernels_deformable_detr.html</a></li>
+         <li><a href='torch_deformable_detr.html' class='file'>torch_deformable_detr.html</a></li>
+     </ul>
+ </body>
+ </html>
deformable_detr/impls/torch_deformable_detr.html ADDED
The diff for this file is too large to render. See raw diff
 
deformable_detr/index.html ADDED
@@ -0,0 +1,89 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <meta charset='UTF-8'>
+     <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+     <title>Index of /deformable_detr</title>
+     <style>/* identical dark-theme styles to deformable_detr/impls/index.html above */</style>
+ </head>
+ <body>
+     <div class='controls'>
+         <a href='../index.html' class='back-button'>← back</a>
+     </div>
+     <h1>Index of /deformable_detr</h1>
+     <ul>
+         <li><a href='impls/index.html' class='dir'>impls/</a></li>
+         <li><a href='results/index.html' class='dir'>results/</a></li>
+     </ul>
+ </body>
+ </html>
deformable_detr/results/artifacts/combine/latency.svg ADDED

Git LFS Details

  • SHA256: b38828b5c85834f31812d3f314ebdc3cc2e8481610a6d31b84a4f9b0ad78c0f2
  • Pointer size: 130 Bytes
  • Size of remote file: 17.8 kB
deformable_detr/results/cells/combine.py ADDED
@@ -0,0 +1,26 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #     "numpy",
+ #     "torch==2.8.0",
+ #     "kernels-benchmark-tools",
+ #     "matplotlib",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+ # ///
+ from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+ # Map display names to uvnote environment variables
+ cache_env_map = {
+     "HF Kernels Deformable DETR": "UVNOTE_FILE_HF_KERNELS_DEFORMABLE_DETR_BENCHMARK",
+     "PyTorch Deformable DETR": "UVNOTE_FILE_TORCH_DEFORMABLE_DETR_BENCHMARK",
+ }
+
+ # Generate combined results with visualization
+ generate_combined_results(
+     cache_env_map=cache_env_map,
+     output_filename="deformable_detr.jsonl",
+     svg_filename="latency.svg",
+ )
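The cache_env_map keys are display labels; the values name environment variables through which uvnote exposes each upstream benchmark cell's artifacts. A rough sketch of the indirection this enables (illustrative only; the real resolution happens inside generate_combined_results, and whether each variable holds a file or a directory path is an assumption here):

    # Hypothetical resolution of benchmark JSONL artifacts from uvnote env vars (sketch).
    import json
    import os
    from pathlib import Path

    for label, env_var in cache_env_map.items():
        raw = os.environ.get(env_var)
        if raw is None:
            continue  # upstream cell was not run in this session
        p = Path(raw)
        jsonl_files = [p] if p.is_file() else sorted(p.rglob("*.jsonl"))
        for f in jsonl_files:
            rows = [json.loads(line) for line in f.read_text().splitlines() if line.strip()]
            print(label, f.name, len(rows), "records")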
deformable_detr/results/combined_results.html ADDED
The diff for this file is too large to render. See raw diff
 
deformable_detr/results/index.html ADDED
@@ -0,0 +1,88 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <meta charset='UTF-8'>
+     <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+     <title>Index of /deformable_detr/results</title>
+     <style>/* identical dark-theme styles to deformable_detr/impls/index.html above */</style>
+ </head>
+ <body>
+     <div class='controls'>
+         <a href='../index.html' class='back-button'>← back</a>
+     </div>
+     <h1>Index of /deformable_detr/results</h1>
+     <ul>
+         <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
+     </ul>
+ </body>
+ </html>
flash_attn/impls/artifacts/benchmark/attention.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9094910000158052, "p50": 0.9113720000186731, "p90": 0.9181919999718957, "mean": 0.9141214000010223, "iqr": 0.007780999965234514, "raw_times": [0.9104110000066612, 0.9094910000158052, 0.9113720000186731, 0.9181919999718957, 0.9211409999920761], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9259819999556385, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9480720000283327, "p50": 0.9496129999888581, "p90": 0.9558429999856344, "mean": 0.952826599996115, "iqr": 0.00735100002202671, "raw_times": [0.9480720000283327, 0.9484919999636077, 0.9496129999888581, 0.9558429999856344, 0.962113000014142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9554529999604711, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0061439999731192, "p50": 1.0189639999680367, "p90": 1.0215840000000753, "mean": 1.017895999996199, "iqr": 0.0038299999687296804, "raw_times": [1.0189639999680367, 1.025034000008418, 1.0177540000313456, 1.0061439999731192, 1.0215840000000753], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0171540000101231, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0146539999595916, "p50": 1.019383999960155, "p90": 1.0202839999919888, "mean": 1.018159799980367, "iqr": 0.004200999967451935, "raw_times": [1.0202839999919888, 1.0146539999595916, 1.0160830000245369, 1.0203939999655631, 1.019383999960155], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0248149999938505, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1737179999613545, "p50": 1.184327000032681, "p90": 1.1859380000487363, "mean": 1.186479800003326, "iqr": 0.010300000042207103, "raw_times": [1.1756380000065292, 1.1737179999613545, 1.1859380000487363, 1.184327000032681, 1.2127779999673294], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1959679999904438, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1702179999701912, "p50": 1.1838479999823903, "p90": 1.1906280000175684, "mean": 1.1843698000006952, "iqr": 0.016700999992735888, "raw_times": [1.1739270000248325, 1.1702179999701912, 1.1838479999823903, 1.1906280000175684, 1.2032280000084938], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1880579999683505, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.208432000112225, "p50": 1.215130999980829, "p90": 1.2198710001030122, "mean": 1.215487200033749, "iqr": 0.006680000069536618, "raw_times": [1.2208109999392036, 1.208432000112225, 1.2198710001030122, 1.2131910000334756, 1.215130999980829], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2240119999660237, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.26713200006634, "p50": 1.2766830000146001, "p90": 1.277253000125711, "mean": 1.2749268000789016, "iqr": 0.004750000016429112, "raw_times": [1.277253000125711, 1.26713200006634, 1.2766830000146001, 1.281063000078575, 1.2725030001092819], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2717629999769997, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2928539999847999, "p50": 1.3003640001443273, "p90": 1.3163240000721999, "mean": 1.3067478000721167, "iqr": 0.01689100008661626, "raw_times": [1.3003640001443273, 1.2928539999847999, 1.2994329999855836, 1.3163240000721999, 1.3247640001736727], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.3026630001604644, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.3232850001259067, "p50": 1.3295650001055037, "p90": 1.3361950000216893, "mean": 1.332684600038192, "iqr": 0.007890999995652237, "raw_times": [1.328304000026037, 1.3361950000216893, 1.3295650001055037, 1.3232850001259067, 1.3460739999118232], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.3245140000890387, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.4790479999646777, "p50": 1.4950690001569455, "p90": 1.4989779999723396, "mean": 1.4914904000306706, "iqr": 0.017840000055002747, "raw_times": [1.5032190001420531, 1.4950690001569455, 1.4790479999646777, 1.4811379999173369, 1.4989779999723396], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.5107090000583412, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.511368999899787, "p50": 1.5117090001695033, "p90": 1.512698999931672, "mean": 1.516499199988175, "iqr": 0.00113999999484804, "raw_times": [1.511368999899787, 1.512698999931672, 1.5117090001695033, 1.511558999936824, 1.5351600000030885], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.5183190000698232, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.0003566741943359375, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/cells/benchmark.py CHANGED
@@ -4,7 +4,6 @@
  #     "numpy",
  #     "torch==2.8.0",
  #     "kernels-benchmark-tools",
- #     "kernels",
  # ]
  #
  # [tool.uv.sources]
@@ -13,19 +12,18 @@
  import torch
  import sys
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
- from kernels import get_kernel

- # Load the flash attention 3 kernel
- hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
-
-
- def hf_flash_attention3(query, key, value):
-     return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
+ def torch_flash(q, k, v):
+     qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+     with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+         o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+     return o.transpose(1, 2).contiguous()


  run_benchmark(
      kernel_type=KernelTypeEnum.ATTENTION,
-     impl_name="hf_kernels_flash_attn3",
-     impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
-     impl_func=hf_flash_attention3,
+     impl_name="torch_flash_ma",
+     impl_tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
+     impl_func=torch_flash,
  )
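For reference, a minimal sketch of exercising the new torch_flash path on one of the benchmarked workloads (shapes from cuda_attn_L128_bfloat16 in the JSONL above; assumes a CUDA device is available):

    # Hypothetical direct call to torch_flash outside the harness (sketch).
    import torch

    q = torch.randn(1, 4224, 24, 128, device="cuda", dtype=torch.bfloat16)
    k, v = torch.randn_like(q), torch.randn_like(q)
    out = torch_flash(q, k, v)  # (batch, seq, heads, head_dim) in and out
    print(out.shape)            # torch.Size([1, 4224, 24, 128])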
flash_attn/impls/flash_attention.html CHANGED
@@ -4110,7 +4110,7 @@ Cell: nv | 0.21s
4110
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
4112
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
4113
- <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/flash_attention.md" target="_blank" class="github-btn">GitHub</a>
4114
  </div>
4115
  <div id="code-nv" class="cell-code" data-lines="3">
4116
  <div class="code-wrap">
@@ -4123,7 +4123,7 @@ Cell: nv | 0.21s
4123
  </div>
4124
  </div>
4125
  <div id="output-nv" class="cell-output">
4126
- <div class="cell-stdout"><pre class="stdout-text">Thu Oct 30 15:52:36 2025
4127
  +-----------------------------------------------------------------------------------------+
4128
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
4129
  |-----------------------------------------+------------------------+----------------------+
@@ -4132,7 +4132,7 @@ Cell: nv | 0.21s
4132
  | | | MIG M. |
4133
  |=========================================+========================+======================|
4134
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
4135
- | N/A 30C P0 75W / 350W | 0MiB / 46068MiB | 11% Default |
4136
  | | | N/A |
4137
  +-----------------------------------------+------------------------+----------------------+
4138
 
@@ -4154,13 +4154,13 @@ Cell: nv | 0.21s
4154
  <span class="collapse-indicators">
4155
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
4156
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4157
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
4158
  </span> |
4159
- Cell: benchmark | 7.50s
4160
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4161
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4162
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4163
- <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/flash_attention.md" target="_blank" class="github-btn">GitHub</a>
4164
  </div>
4165
  <div id="code-benchmark" class="cell-code" data-lines="29">
4166
  <div class="code-wrap">
@@ -4207,29 +4207,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
4207
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4208
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4209
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4210
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.587ms 102.23% 3.587ms 3.587ms 1
4211
- torch_flash_ma 7.11% 370.236us 47.42% 2.468ms 2.468ms 0.000us 0.00% 3.549ms 3.549ms 1
4212
- aten::scaled_dot_product_attention 0.85% 44.391us 4.44% 231.334us 77.111us 0.000us 0.00% 2.791ms 930.498us 3
4213
- aten::_scaled_dot_product_flash_attention 0.51% 26.381us 3.59% 186.943us 62.314us 0.000us 0.00% 2.791ms 930.498us 3
4214
- aten::_flash_attention_forward 0.76% 39.658us 2.57% 134.002us 44.667us 2.791ms 79.55% 2.791ms 930.498us 3
4215
- void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.791ms 79.55% 2.791ms 930.498us 3
4216
- aten::contiguous 0.30% 15.641us 34.37% 1.789ms 149.098us 0.000us 0.00% 757.697us 63.141us 12
4217
- aten::clone 0.74% 38.596us 34.07% 1.774ms 147.794us 0.000us 0.00% 757.697us 63.141us 12
4218
- aten::copy_ 1.78% 92.553us 31.63% 1.647ms 137.218us 717.505us 20.45% 757.697us 63.141us 12
4219
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 717.505us 20.45% 717.505us 59.792us 12
4220
- Activity Buffer Request 27.90% 1.452ms 27.90% 1.452ms 1.452ms 40.192us 1.15% 40.192us 40.192us 1
4221
- aten::transpose 1.49% 77.390us 2.00% 104.302us 4.346us 0.000us 0.00% 0.000us 0.000us 24
4222
- aten::as_strided 0.52% 26.912us 0.52% 26.912us 1.121us 0.000us 0.00% 0.000us 0.000us 24
4223
- aten::empty_like 0.55% 28.453us 2.13% 110.953us 7.397us 0.000us 0.00% 0.000us 0.000us 15
4224
- aten::empty 1.93% 100.211us 1.93% 100.211us 4.175us 0.000us 0.00% 0.000us 0.000us 24
4225
- cudaLaunchKernel 2.45% 127.363us 2.45% 127.363us 8.491us 0.000us 0.00% 0.000us 0.000us 15
4226
- aten::empty_strided 0.32% 16.580us 0.32% 16.580us 5.527us 0.000us 0.00% 0.000us 0.000us 3
4227
- cudaDeviceGetAttribute 0.05% 2.441us 0.05% 2.441us 0.407us 0.000us 0.00% 0.000us 0.000us 6
4228
- cudaFuncSetAttribute 0.18% 9.241us 0.18% 9.241us 3.080us 0.000us 0.00% 0.000us 0.000us 3
4229
- cudaDeviceSynchronize 52.58% 2.737ms 52.58% 2.737ms 2.737ms 0.000us 0.00% 0.000us 0.000us 1
4230
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4231
- Self CPU time total: 5.205ms
4232
- Self CUDA time total: 3.509ms
4233
 
4234
 
4235
 
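These per-operator tables come from the harness's profiling pass around the torch_flash wrapper shown in the benchmark cell above. A minimal torch.profiler sketch that yields the same kind of table (an assumption about the mechanism; the harness's profiling code is not part of this diff):

    # Sketch: profiling one torch_flash call with torch.profiler.
    import torch
    from torch.profiler import profile, ProfilerActivity

    q = torch.randn(1, 4224, 24, 128, device="cuda", dtype=torch.bfloat16)
    k, v = torch.randn_like(q), torch.randn_like(q)
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        torch_flash(q, k, v)
        torch.cuda.synchronize()
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))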
@@ -4239,29 +4239,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
4239
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4240
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4241
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4242
- torch_flash_ma 4.72% 248.136us 41.78% 2.196ms 2.196ms 0.000us 0.00% 3.803ms 3.803ms 1
4243
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.759ms 100.28% 3.759ms 3.759ms 1
4244
- aten::scaled_dot_product_attention 0.51% 26.852us 3.40% 178.734us 59.578us 0.000us 0.00% 2.990ms 996.607us 3
4245
- aten::_scaled_dot_product_flash_attention 0.35% 18.418us 2.89% 151.882us 50.627us 0.000us 0.00% 2.990ms 996.607us 3
4246
- aten::_flash_attention_forward 0.65% 34.063us 2.10% 110.562us 36.854us 2.990ms 79.76% 2.990ms 996.607us 3
4247
- void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.990ms 79.76% 2.990ms 996.607us 3
4248
- aten::contiguous 0.19% 10.079us 32.75% 1.721ms 143.446us 0.000us 0.00% 813.629us 67.802us 12
4249
- aten::clone 0.54% 28.151us 32.56% 1.711ms 142.606us 0.000us 0.00% 813.629us 67.802us 12
4250
- aten::copy_ 1.97% 103.281us 30.84% 1.621ms 135.084us 758.782us 20.24% 813.629us 67.802us 12
4251
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 758.782us 20.24% 758.782us 63.232us 12
4252
- Activity Buffer Request 27.29% 1.434ms 27.29% 1.434ms 1.434ms 54.847us 1.46% 54.847us 54.847us 1
4253
- aten::transpose 0.98% 51.741us 1.34% 70.423us 2.934us 0.000us 0.00% 0.000us 0.000us 24
4254
- aten::as_strided 0.36% 18.682us 0.36% 18.682us 0.778us 0.000us 0.00% 0.000us 0.000us 24
4255
- aten::empty_like 0.38% 19.848us 1.54% 80.939us 5.396us 0.000us 0.00% 0.000us 0.000us 15
4256
- aten::empty 1.45% 76.001us 1.45% 76.001us 3.167us 0.000us 0.00% 0.000us 0.000us 24
4257
- cudaLaunchKernel 2.04% 106.952us 2.04% 106.952us 7.130us 0.000us 0.00% 0.000us 0.000us 15
4258
- aten::empty_strided 0.26% 13.850us 0.26% 13.850us 4.617us 0.000us 0.00% 0.000us 0.000us 3
4259
- cudaDeviceGetAttribute 0.04% 1.860us 0.04% 1.860us 0.310us 0.000us 0.00% 0.000us 0.000us 6
4260
- cudaFuncSetAttribute 0.07% 3.760us 0.07% 3.760us 1.253us 0.000us 0.00% 0.000us 0.000us 3
4261
- cudaDeviceSynchronize 58.22% 3.060ms 58.22% 3.060ms 3.060ms 0.000us 0.00% 0.000us 0.000us 1
4262
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4263
- Self CPU time total: 5.255ms
4264
- Self CUDA time total: 3.749ms
4265
 
4266
 
4267
 
@@ -4271,29 +4271,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
4271
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4272
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4273
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4274
- torch_flash_ma 4.59% 242.054us 41.69% 2.201ms 2.201ms 0.000us 0.00% 3.795ms 3.795ms 1
4275
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.746ms 100.27% 3.746ms 3.746ms 1
4276
- aten::scaled_dot_product_attention 0.50% 26.150us 3.40% 179.413us 59.804us 0.000us 0.00% 2.957ms 985.581us 3
4277
- aten::_scaled_dot_product_flash_attention 0.35% 18.371us 2.90% 153.263us 51.088us 0.000us 0.00% 2.957ms 985.581us 3
4278
- aten::_flash_attention_forward 0.64% 34.041us 2.11% 111.213us 37.071us 2.957ms 79.14% 2.957ms 985.581us 3
4279
- void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.957ms 79.14% 2.957ms 985.581us 3
4280
- aten::contiguous 0.19% 9.991us 32.85% 1.734ms 144.489us 0.000us 0.00% 838.147us 69.846us 12
4281
- aten::clone 0.52% 27.541us 32.66% 1.724ms 143.657us 0.000us 0.00% 838.147us 69.846us 12
4282
- aten::copy_ 1.47% 77.641us 30.91% 1.632ms 135.987us 779.363us 20.86% 838.147us 69.846us 12
4283
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 779.363us 20.86% 779.363us 64.947us 12
4284
- Activity Buffer Request 27.89% 1.472ms 27.89% 1.472ms 1.472ms 58.784us 1.57% 58.784us 58.784us 1
4285
- aten::transpose 0.96% 50.819us 1.31% 69.110us 2.880us 0.000us 0.00% 0.000us 0.000us 24
4286
- aten::as_strided 0.35% 18.291us 0.35% 18.291us 0.762us 0.000us 0.00% 0.000us 0.000us 24
4287
- aten::empty_like 0.38% 20.141us 1.58% 83.392us 5.559us 0.000us 0.00% 0.000us 0.000us 15
4288
- aten::empty 1.49% 78.782us 1.49% 78.782us 3.283us 0.000us 0.00% 0.000us 0.000us 24
4289
- cudaLaunchKernel 1.99% 104.800us 1.99% 104.800us 6.987us 0.000us 0.00% 0.000us 0.000us 15
4290
- aten::empty_strided 0.27% 14.320us 0.27% 14.320us 4.773us 0.000us 0.00% 0.000us 0.000us 3
4291
- cudaDeviceGetAttribute 0.04% 1.870us 0.04% 1.870us 0.312us 0.000us 0.00% 0.000us 0.000us 6
4292
- cudaFuncSetAttribute 0.07% 3.720us 0.07% 3.720us 1.240us 0.000us 0.00% 0.000us 0.000us 3
4293
- cudaDeviceSynchronize 58.31% 3.078ms 58.31% 3.078ms 3.078ms 0.000us 0.00% 0.000us 0.000us 1
4294
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4295
- Self CPU time total: 5.279ms
4296
- Self CUDA time total: 3.736ms
4297
 
4298
 
4299
 
@@ -4303,29 +4303,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
4303
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4304
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4305
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4306
- torch_flash_ma 4.47% 246.252us 42.66% 2.352ms 2.352ms 0.000us 0.00% 3.878ms 3.878ms 1
4307
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.831ms 100.28% 3.831ms 3.831ms 1
4308
- aten::scaled_dot_product_attention 0.47% 26.180us 3.22% 177.714us 59.238us 0.000us 0.00% 3.035ms 1.012ms 3
4309
- aten::_scaled_dot_product_flash_attention 0.34% 18.934us 2.75% 151.534us 50.511us 0.000us 0.00% 3.035ms 1.012ms 3
4310
- aten::_flash_attention_forward 0.60% 33.169us 1.99% 109.931us 36.644us 3.035ms 79.45% 3.035ms 1.012ms 3
4311
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.035ms 79.45% 3.035ms 1.012ms 3
4312
- aten::contiguous 0.19% 10.269us 34.14% 1.882ms 156.829us 0.000us 0.00% 843.264us 70.272us 12
4313
- aten::clone 0.51% 27.861us 33.95% 1.872ms 155.974us 0.000us 0.00% 843.264us 70.272us 12
4314
- aten::copy_ 1.39% 76.612us 32.27% 1.779ms 148.225us 785.216us 20.55% 843.264us 70.272us 12
4315
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 785.216us 20.55% 785.216us 65.435us 12
4316
- Activity Buffer Request 26.00% 1.433ms 26.00% 1.433ms 1.433ms 58.048us 1.52% 58.048us 58.048us 1
4317
- aten::transpose 0.90% 49.620us 1.24% 68.282us 2.845us 0.000us 0.00% 0.000us 0.000us 24
4318
- aten::as_strided 0.34% 18.662us 0.34% 18.662us 0.778us 0.000us 0.00% 0.000us 0.000us 24
4319
- aten::empty_like 0.37% 20.139us 1.52% 83.911us 5.594us 0.000us 0.00% 0.000us 0.000us 15
4320
- aten::empty 1.44% 79.524us 1.44% 79.524us 3.313us 0.000us 0.00% 0.000us 0.000us 24
4321
- cudaLaunchKernel 5.29% 291.664us 5.29% 291.664us 19.444us 0.000us 0.00% 0.000us 0.000us 15
4322
- aten::empty_strided 0.25% 13.850us 0.25% 13.850us 4.617us 0.000us 0.00% 0.000us 0.000us 3
4323
- cudaDeviceGetAttribute 0.03% 1.810us 0.03% 1.810us 0.302us 0.000us 0.00% 0.000us 0.000us 6
4324
- cudaFuncSetAttribute 0.07% 3.620us 0.07% 3.620us 1.207us 0.000us 0.00% 0.000us 0.000us 3
4325
- cudaDeviceSynchronize 57.34% 3.161ms 57.34% 3.161ms 3.161ms 0.000us 0.00% 0.000us 0.000us 1
4326
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4327
- Self CPU time total: 5.512ms
4328
- Self CUDA time total: 3.820ms
4329
 
4330
 
4331
 
@@ -4335,29 +4335,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
4335
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4336
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4337
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4338
- torch_flash_ma 4.69% 283.303us 42.14% 2.547ms 2.547ms 0.000us 0.00% 4.304ms 4.304ms 1
4339
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.254ms 100.24% 4.254ms 4.254ms 1
4340
- aten::scaled_dot_product_attention 0.82% 49.722us 3.53% 213.285us 71.095us 0.000us 0.00% 3.439ms 1.146ms 3
4341
- aten::_scaled_dot_product_flash_attention 0.34% 20.582us 2.71% 163.563us 54.521us 0.000us 0.00% 3.439ms 1.146ms 3
4342
- aten::_flash_attention_forward 0.62% 37.231us 1.93% 116.771us 38.924us 3.439ms 81.02% 3.439ms 1.146ms 3
4343
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.439ms 81.02% 3.439ms 1.146ms 3
4344
- aten::contiguous 0.18% 10.912us 32.97% 1.993ms 166.068us 0.000us 0.00% 865.695us 72.141us 12
4345
- aten::clone 0.50% 30.059us 32.79% 1.982ms 165.158us 0.000us 0.00% 865.695us 72.141us 12
4346
- aten::copy_ 1.39% 83.902us 31.17% 1.884ms 157.000us 805.439us 18.98% 865.695us 72.141us 12
4347
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 805.439us 18.98% 805.439us 67.120us 12
4348
- Activity Buffer Request 24.08% 1.456ms 24.08% 1.456ms 1.456ms 60.256us 1.42% 60.256us 60.256us 1
4349
- aten::transpose 1.06% 63.793us 1.39% 84.162us 3.507us 0.000us 0.00% 0.000us 0.000us 24
4350
- aten::as_strided 0.34% 20.369us 0.34% 20.369us 0.849us 0.000us 0.00% 0.000us 0.000us 24
4351
- aten::empty_like 0.36% 21.791us 1.46% 88.331us 5.889us 0.000us 0.00% 0.000us 0.000us 15
4352
- aten::empty 1.33% 80.570us 1.33% 80.570us 3.357us 0.000us 0.00% 0.000us 0.000us 24
4353
- cudaLaunchKernel 6.09% 368.355us 6.09% 368.355us 24.557us 0.000us 0.00% 0.000us 0.000us 15
4354
- aten::empty_strided 0.25% 15.000us 0.25% 15.000us 5.000us 0.000us 0.00% 0.000us 0.000us 3
4355
- cudaDeviceGetAttribute 0.03% 1.990us 0.03% 1.990us 0.332us 0.000us 0.00% 0.000us 0.000us 6
4356
- cudaFuncSetAttribute 0.07% 4.160us 0.07% 4.160us 1.387us 0.000us 0.00% 0.000us 0.000us 3
4357
- cudaDeviceSynchronize 57.86% 3.497ms 57.86% 3.497ms 3.497ms 0.000us 0.00% 0.000us 0.000us 1
4358
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4359
- Self CPU time total: 6.045ms
4360
- Self CUDA time total: 4.244ms
4361
 
4362
 
4363
 
@@ -4367,45 +4367,39 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
4367
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4368
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4369
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4370
- torch_flash_ma 4.04% 248.485us 39.71% 2.440ms 2.440ms 0.000us 0.00% 4.431ms 4.431ms 1
4371
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.380ms 100.24% 4.380ms 4.380ms 1
4372
- aten::scaled_dot_product_attention 0.42% 25.679us 2.90% 178.082us 59.361us 0.000us 0.00% 3.552ms 1.184ms 3
4373
- aten::_scaled_dot_product_flash_attention 0.29% 17.912us 2.48% 152.403us 50.801us 0.000us 0.00% 3.552ms 1.184ms 3
4374
- aten::_flash_attention_forward 0.56% 34.360us 1.81% 111.452us 37.151us 3.552ms 81.28% 3.552ms 1.184ms 3
4375
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.552ms 81.28% 3.552ms 1.184ms 3
4376
- aten::contiguous 0.17% 10.359us 32.01% 1.967ms 163.915us 0.000us 0.00% 879.392us 73.283us 12
4377
- aten::clone 0.45% 27.371us 31.84% 1.957ms 163.052us 0.000us 0.00% 879.392us 73.283us 12
4378
- aten::copy_ 1.33% 81.681us 30.34% 1.864ms 155.367us 818.048us 18.72% 879.392us 73.283us 12
4379
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 818.048us 18.72% 818.048us 68.171us 12
4380
- Activity Buffer Request 23.48% 1.443ms 23.48% 1.443ms 1.443ms 61.344us 1.40% 61.344us 61.344us 1
4381
- aten::transpose 0.84% 51.433us 1.14% 69.901us 2.913us 0.000us 0.00% 0.000us 0.000us 24
4382
- aten::as_strided 0.30% 18.468us 0.30% 18.468us 0.769us 0.000us 0.00% 0.000us 0.000us 24
4383
- aten::empty_like 0.32% 19.754us 1.37% 83.993us 5.600us 0.000us 0.00% 0.000us 0.000us 15
4384
- aten::empty 1.26% 77.740us 1.26% 77.740us 3.239us 0.000us 0.00% 0.000us 0.000us 24
4385
- cudaLaunchKernel 5.92% 364.005us 5.92% 364.005us 24.267us 0.000us 0.00% 0.000us 0.000us 15
4386
- aten::empty_strided 0.23% 14.381us 0.23% 14.381us 4.794us 0.000us 0.00% 0.000us 0.000us 3
4387
- cudaDeviceGetAttribute 0.03% 1.840us 0.03% 1.840us 0.307us 0.000us 0.00% 0.000us 0.000us 6
4388
- cudaFuncSetAttribute 0.07% 4.180us 0.07% 4.180us 1.393us 0.000us 0.00% 0.000us 0.000us 3
4389
- cudaDeviceSynchronize 60.29% 3.705ms 60.29% 3.705ms 3.705ms 0.000us 0.00% 0.000us 0.000us 1
4390
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4391
- Self CPU time total: 6.146ms
4392
- Self CUDA time total: 4.370ms
4393
 
4394
 
4395
  impl wl p50(ms) ok
4396
  torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
4397
- torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
4398
- torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
4399
- torch_flash_ma cuda_attn_L384_bfloat16 1.30 True
4400
- torch_flash_ma cuda_attn_L448_bfloat16 1.45 True
4401
- torch_flash_ma cuda_attn_L512_bfloat16 1.49 True
4402
  </pre></div>
4403
- <div class="uv-install-logs" id="uv-logs-benchmark">
4404
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4405
- <div class="uv-logs-content" style="display: none;">
4406
- Installed 37 packages in 225ms
4407
- </div>
4408
- </div>
4409
  <div class="cell-artifacts">
4410
  <h4>Artifacts:</h4>
4411
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
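The p50(ms) column in the summary above is a median over repeated timed runs of each workload. A minimal sketch of reducing raw per-repetition latencies to such a summary (function and field names hypothetical, not the harness's):

    import statistics

    def summarize(raw_times_ms):
        # raw_times_ms: per-repetition latencies in milliseconds.
        qs = statistics.quantiles(raw_times_ms, n=10, method="inclusive")
        return {
            "p10": qs[0],                          # 10th percentile
            "p50": statistics.median(raw_times_ms),
            "p90": qs[8],                          # 90th percentile
            "mean": statistics.fmean(raw_times_ms),
        }

    # e.g. summarize([1.22, 1.28, 1.30, 1.33, 1.50])["p50"] -> 1.30
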
4110
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
4112
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
4113
+ <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/flash_attention.md" target="_blank" class="github-btn">GitHub</a>
4114
  </div>
4115
  <div id="code-nv" class="cell-code" data-lines="3">
4116
  <div class="code-wrap">
 
4123
  </div>
4124
  </div>
4125
  <div id="output-nv" class="cell-output">
4126
+ <div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:13:43 2025
4127
  +-----------------------------------------------------------------------------------------+
4128
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
4129
  |-----------------------------------------+------------------------+----------------------+
 
4132
  | | | MIG M. |
4133
  |=========================================+========================+======================|
4134
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
4135
+ | N/A 43C P0 83W / 350W | 0MiB / 46068MiB | 11% Default |
4136
  | | | N/A |
4137
  +-----------------------------------------+------------------------+----------------------+
4138
 
 
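The nv cell above pins the hardware context (NVIDIA L40S, driver 570.195.03, CUDA 12.8) via nvidia-smi. A sketch of collecting the same facts from Python so they can travel with each benchmark record; this is illustrative only, not the harness's actual env collector:

    import subprocess
    import torch

    def gpu_env():
        props = torch.cuda.get_device_properties(0)
        return {
            "gpu": props.name,                      # e.g. "NVIDIA L40S"
            "sm": f"{props.major}.{props.minor}",   # compute capability
            "torch": torch.__version__,
            "cuda": torch.version.cuda,
            # Driver version is easiest to read from nvidia-smi itself:
            "driver": subprocess.check_output(
                ["nvidia-smi", "--query-gpu=driver_version",
                 "--format=csv,noheader"],
                text=True,
            ).strip(),
        }
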
4154
  <span class="collapse-indicators">
4155
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
4156
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4157
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
4158
  </span> |
4159
+ Cell: benchmark | 3.87s
4160
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4161
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4162
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4163
+ <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/flash_attention.md" target="_blank" class="github-btn">GitHub</a>
4164
  </div>
4165
  <div id="code-benchmark" class="cell-code" data-lines="29">
4166
  <div class="code-wrap">
 
4207
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4208
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4209
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4210
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.600ms 101.99% 3.600ms 3.600ms 1
4211
+ torch_flash_ma 6.70% 350.157us 46.68% 2.439ms 2.439ms 0.000us 0.00% 3.570ms 3.570ms 1
4212
+ aten::scaled_dot_product_attention 0.81% 42.281us 4.26% 222.626us 74.209us 0.000us 0.00% 2.816ms 938.781us 3
4213
+ aten::_scaled_dot_product_flash_attention 0.52% 27.002us 3.45% 180.345us 60.115us 0.000us 0.00% 2.816ms 938.781us 3
4214
+ aten::_flash_attention_forward 0.79% 41.210us 2.54% 132.453us 44.151us 2.816ms 79.78% 2.816ms 938.781us 3
4215
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.816ms 79.78% 2.816ms 938.781us 3
4216
+ aten::contiguous 0.29% 15.041us 34.44% 1.800ms 149.962us 0.000us 0.00% 753.884us 62.824us 12
4217
+ aten::clone 0.75% 38.969us 34.15% 1.785ms 148.709us 0.000us 0.00% 753.884us 62.824us 12
4218
+ aten::copy_ 1.73% 90.324us 31.78% 1.661ms 138.388us 713.788us 20.22% 753.884us 62.824us 12
4219
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 713.788us 20.22% 713.788us 59.482us 12
4220
+ Activity Buffer Request 28.08% 1.467ms 28.08% 1.467ms 1.467ms 40.096us 1.14% 40.096us 40.096us 1
4221
+ aten::transpose 1.25% 65.371us 1.68% 87.543us 3.648us 0.000us 0.00% 0.000us 0.000us 24
4222
+ aten::as_strided 0.42% 22.172us 0.42% 22.172us 0.924us 0.000us 0.00% 0.000us 0.000us 24
4223
+ aten::empty_like 0.53% 27.463us 2.06% 107.524us 7.168us 0.000us 0.00% 0.000us 0.000us 15
4224
+ aten::empty 1.78% 93.220us 1.78% 93.220us 3.884us 0.000us 0.00% 0.000us 0.000us 24
4225
+ cudaLaunchKernel 2.49% 130.035us 2.49% 130.035us 8.669us 0.000us 0.00% 0.000us 0.000us 15
4226
+ aten::empty_strided 0.32% 16.730us 0.32% 16.730us 5.577us 0.000us 0.00% 0.000us 0.000us 3
4227
+ cudaDeviceGetAttribute 0.05% 2.690us 0.05% 2.690us 0.448us 0.000us 0.00% 0.000us 0.000us 6
4228
+ cudaFuncSetAttribute 0.17% 9.000us 0.17% 9.000us 3.000us 0.000us 0.00% 0.000us 0.000us 3
4229
+ cudaDeviceSynchronize 53.32% 2.786ms 53.32% 2.786ms 2.786ms 0.000us 0.00% 0.000us 0.000us 1
4230
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4231
+ Self CPU time total: 5.225ms
4232
+ Self CUDA time total: 3.530ms
4233
 
4234
 
4235
 
 
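The tables in this output are torch.profiler key_averages() tables; the "Activity Buffer Request" row appears to be profiler-side bookkeeping rather than workload cost, and the large cudaDeviceSynchronize row is the explicit sync that fences the timed region. A hedged sketch of producing this kind of trace (sort key and row limit assumed):

    import torch
    from torch.profiler import ProfilerActivity, profile, record_function

    def trace(fn, *args, reps=3):
        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
            with record_function("torch_flash_ma"):  # the labelled top row of each table
                for _ in range(reps):                # "# of Calls" = 3 for the SDPA rows
                    fn(*args)
            torch.cuda.synchronize()                 # the cudaDeviceSynchronize row
        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=25))
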
4239
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4240
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4241
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4242
+ torch_flash_ma 4.88% 260.255us 42.26% 2.252ms 2.252ms 0.000us 0.00% 3.798ms 3.798ms 1
4243
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.753ms 100.28% 3.753ms 3.753ms 1
4244
+ aten::scaled_dot_product_attention 0.49% 25.890us 3.50% 186.735us 62.245us 0.000us 0.00% 2.976ms 991.858us 3
4245
+ aten::_scaled_dot_product_flash_attention 0.33% 17.842us 3.02% 160.845us 53.615us 0.000us 0.00% 2.976ms 991.858us 3
4246
+ aten::_flash_attention_forward 0.74% 39.289us 2.26% 120.363us 40.121us 2.976ms 79.51% 2.976ms 991.858us 3
4247
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.976ms 79.51% 2.976ms 991.858us 3
4248
+ aten::contiguous 0.20% 10.403us 33.03% 1.760ms 146.680us 0.000us 0.00% 822.042us 68.504us 12
4249
+ aten::clone 0.53% 28.238us 32.84% 1.750ms 145.813us 0.000us 0.00% 822.042us 68.504us 12
4250
+ aten::copy_ 1.51% 80.312us 31.12% 1.659ms 138.210us 766.874us 20.49% 822.042us 68.504us 12
4251
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 766.874us 20.49% 766.874us 63.906us 12
4252
+ Activity Buffer Request 28.02% 1.493ms 28.02% 1.493ms 1.493ms 55.168us 1.47% 55.168us 55.168us 1
4253
+ aten::transpose 0.94% 50.313us 1.27% 67.673us 2.820us 0.000us 0.00% 0.000us 0.000us 24
4254
+ aten::as_strided 0.33% 17.360us 0.33% 17.360us 0.723us 0.000us 0.00% 0.000us 0.000us 24
4255
+ aten::empty_like 0.40% 21.528us 1.56% 83.370us 5.558us 0.000us 0.00% 0.000us 0.000us 15
4256
+ aten::empty 1.43% 76.263us 1.43% 76.263us 3.178us 0.000us 0.00% 0.000us 0.000us 24
4257
+ cudaLaunchKernel 2.08% 110.943us 2.08% 110.943us 7.396us 0.000us 0.00% 0.000us 0.000us 15
4258
+ aten::empty_strided 0.27% 14.621us 0.27% 14.621us 4.874us 0.000us 0.00% 0.000us 0.000us 3
4259
+ cudaDeviceGetAttribute 0.03% 1.781us 0.03% 1.781us 0.297us 0.000us 0.00% 0.000us 0.000us 6
4260
+ cudaFuncSetAttribute 0.08% 4.011us 0.08% 4.011us 1.337us 0.000us 0.00% 0.000us 0.000us 3
4261
+ cudaDeviceSynchronize 57.74% 3.077ms 57.74% 3.077ms 3.077ms 0.000us 0.00% 0.000us 0.000us 1
4262
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4263
+ Self CPU time total: 5.329ms
4264
+ Self CUDA time total: 3.742ms
4265
 
4266
 
4267
 
 
4271
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4272
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4273
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4274
+ torch_flash_ma 4.87% 262.676us 41.62% 2.245ms 2.245ms 0.000us 0.00% 3.882ms 3.882ms 1
4275
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.834ms 100.29% 3.834ms 3.834ms 1
4276
+ aten::scaled_dot_product_attention 0.50% 26.770us 3.49% 188.015us 62.672us 0.000us 0.00% 3.044ms 1.015ms 3
4277
+ aten::_scaled_dot_product_flash_attention 0.35% 18.803us 2.99% 161.245us 53.748us 0.000us 0.00% 3.044ms 1.015ms 3
4278
+ aten::_flash_attention_forward 0.74% 39.829us 2.21% 119.102us 39.701us 3.044ms 79.61% 3.044ms 1.015ms 3
4279
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.044ms 79.61% 3.044ms 1.015ms 3
4280
+ aten::contiguous 0.18% 9.451us 32.36% 1.746ms 145.465us 0.000us 0.00% 838.367us 69.864us 12
4281
+ aten::clone 0.54% 28.881us 32.18% 1.736ms 144.678us 0.000us 0.00% 838.367us 69.864us 12
4282
+ aten::copy_ 1.51% 81.201us 30.48% 1.644ms 137.016us 779.615us 20.39% 838.367us 69.864us 12
4283
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 779.615us 20.39% 779.615us 64.968us 12
4284
+ Activity Buffer Request 27.31% 1.473ms 27.31% 1.473ms 1.473ms 58.752us 1.54% 58.752us 58.752us 1
4285
+ aten::transpose 1.01% 54.592us 1.34% 72.471us 3.020us 0.000us 0.00% 0.000us 0.000us 24
4286
+ aten::as_strided 0.33% 17.879us 0.33% 17.879us 0.745us 0.000us 0.00% 0.000us 0.000us 24
4287
+ aten::empty_like 0.37% 20.117us 1.53% 82.751us 5.517us 0.000us 0.00% 0.000us 0.000us 15
4288
+ aten::empty 1.41% 76.295us 1.41% 76.295us 3.179us 0.000us 0.00% 0.000us 0.000us 24
4289
+ cudaLaunchKernel 2.13% 114.795us 2.13% 114.795us 7.653us 0.000us 0.00% 0.000us 0.000us 15
4290
+ aten::empty_strided 0.27% 14.801us 0.27% 14.801us 4.934us 0.000us 0.00% 0.000us 0.000us 3
4291
+ cudaDeviceGetAttribute 0.04% 2.110us 0.04% 2.110us 0.352us 0.000us 0.00% 0.000us 0.000us 6
4292
+ cudaFuncSetAttribute 0.07% 3.990us 0.07% 3.990us 1.330us 0.000us 0.00% 0.000us 0.000us 3
4293
+ cudaDeviceSynchronize 58.38% 3.149ms 58.38% 3.149ms 3.149ms 0.000us 0.00% 0.000us 0.000us 1
4294
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4295
+ Self CPU time total: 5.395ms
4296
+ Self CUDA time total: 3.823ms
4297
 
4298
 
4299
 
 
4303
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4304
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4305
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4306
+ torch_flash_ma 4.61% 261.106us 43.54% 2.469ms 2.469ms 0.000us 0.00% 3.945ms 3.945ms 1
4307
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.898ms 100.28% 3.898ms 3.898ms 1
4308
+ aten::scaled_dot_product_attention 0.46% 26.241us 3.40% 192.654us 64.218us 0.000us 0.00% 3.100ms 1.033ms 3
4309
+ aten::_scaled_dot_product_flash_attention 0.34% 19.509us 2.94% 166.413us 55.471us 0.000us 0.00% 3.100ms 1.033ms 3
4310
+ aten::_flash_attention_forward 0.74% 42.081us 2.16% 122.633us 40.878us 3.100ms 79.76% 3.100ms 1.033ms 3
4311
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.100ms 79.76% 3.100ms 1.033ms 3
4312
+ aten::contiguous 0.20% 11.161us 34.71% 1.968ms 163.994us 0.000us 0.00% 844.704us 70.392us 12
4313
+ aten::clone 0.52% 29.682us 34.51% 1.957ms 163.064us 0.000us 0.00% 844.704us 70.392us 12
4314
+ aten::copy_ 1.45% 82.261us 32.81% 1.860ms 155.026us 786.784us 20.24% 844.704us 70.392us 12
4315
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 786.784us 20.24% 786.784us 65.565us 12
4316
+ Activity Buffer Request 26.26% 1.489ms 26.26% 1.489ms 1.489ms 57.920us 1.49% 57.920us 57.920us 1
4317
+ aten::transpose 0.95% 53.820us 1.26% 71.322us 2.972us 0.000us 0.00% 0.000us 0.000us 24
4318
+ aten::as_strided 0.31% 17.502us 0.31% 17.502us 0.729us 0.000us 0.00% 0.000us 0.000us 24
4319
+ aten::empty_like 0.39% 21.943us 1.53% 86.983us 5.799us 0.000us 0.00% 0.000us 0.000us 15
4320
+ aten::empty 1.40% 79.202us 1.40% 79.202us 3.300us 0.000us 0.00% 0.000us 0.000us 24
4321
+ cudaLaunchKernel 5.55% 314.487us 5.55% 314.487us 20.966us 0.000us 0.00% 0.000us 0.000us 15
4322
+ aten::empty_strided 0.26% 14.830us 0.26% 14.830us 4.943us 0.000us 0.00% 0.000us 0.000us 3
4323
+ cudaDeviceGetAttribute 0.04% 2.010us 0.04% 2.010us 0.335us 0.000us 0.00% 0.000us 0.000us 6
4324
+ cudaFuncSetAttribute 0.07% 4.040us 0.07% 4.040us 1.347us 0.000us 0.00% 0.000us 0.000us 3
4325
+ cudaDeviceSynchronize 56.46% 3.201ms 56.46% 3.201ms 3.201ms 0.000us 0.00% 0.000us 0.000us 1
4326
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4327
+ Self CPU time total: 5.670ms
4328
+ Self CUDA time total: 3.887ms
4329
 
4330
 
4331
 
 
4335
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4336
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4337
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4338
+ torch_flash_ma 5.12% 312.519us 40.82% 2.493ms 2.493ms 0.000us 0.00% 4.416ms 4.416ms 1
4339
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.365ms 100.24% 4.365ms 4.365ms 1
4340
+ aten::scaled_dot_product_attention 0.42% 25.922us 3.20% 195.246us 65.082us 0.000us 0.00% 3.547ms 1.182ms 3
4341
+ aten::_scaled_dot_product_flash_attention 0.34% 20.847us 2.77% 169.324us 56.441us 0.000us 0.00% 3.547ms 1.182ms 3
4342
+ aten::_flash_attention_forward 0.72% 44.243us 2.07% 126.303us 42.101us 3.547ms 81.45% 3.547ms 1.182ms 3
4343
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.547ms 81.45% 3.547ms 1.182ms 3
4344
+ aten::contiguous 0.17% 10.559us 31.73% 1.938ms 161.473us 0.000us 0.00% 869.122us 72.427us 12
4345
+ aten::clone 0.47% 28.763us 31.56% 1.927ms 160.593us 0.000us 0.00% 869.122us 72.427us 12
4346
+ aten::copy_ 1.36% 83.033us 30.01% 1.832ms 152.707us 807.906us 18.55% 869.122us 72.427us 12
4347
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 807.906us 18.55% 807.906us 67.326us 12
4348
+ Activity Buffer Request 24.51% 1.497ms 24.51% 1.497ms 1.497ms 61.216us 1.41% 61.216us 61.216us 1
4349
+ aten::transpose 0.85% 52.195us 1.14% 69.864us 2.911us 0.000us 0.00% 0.000us 0.000us 24
4350
+ aten::as_strided 0.29% 17.669us 0.29% 17.669us 0.736us 0.000us 0.00% 0.000us 0.000us 24
4351
+ aten::empty_like 0.34% 20.921us 1.44% 87.791us 5.853us 0.000us 0.00% 0.000us 0.000us 15
4352
+ aten::empty 1.30% 79.270us 1.30% 79.270us 3.303us 0.000us 0.00% 0.000us 0.000us 24
4353
+ cudaLaunchKernel 4.55% 277.575us 4.55% 277.575us 18.505us 0.000us 0.00% 0.000us 0.000us 15
4354
+ aten::empty_strided 0.27% 16.520us 0.27% 16.520us 5.507us 0.000us 0.00% 0.000us 0.000us 3
4355
+ cudaDeviceGetAttribute 0.03% 1.960us 0.03% 1.960us 0.327us 0.000us 0.00% 0.000us 0.000us 6
4356
+ cudaFuncSetAttribute 0.07% 4.040us 0.07% 4.040us 1.347us 0.000us 0.00% 0.000us 0.000us 3
4357
+ cudaDeviceSynchronize 59.18% 3.614ms 59.18% 3.614ms 3.614ms 0.000us 0.00% 0.000us 0.000us 1
4358
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4359
+ Self CPU time total: 6.107ms
4360
+ Self CUDA time total: 4.355ms
4361
 
4362
 
4363
 
 
4367
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4368
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4369
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4370
+ torch_flash_ma 3.85% 236.256us 38.02% 2.335ms 2.335ms 0.000us 0.00% 4.535ms 4.535ms 1
4371
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.485ms 100.25% 4.485ms 4.485ms 1
4372
+ aten::scaled_dot_product_attention 0.43% 26.452us 2.98% 183.275us 61.092us 0.000us 0.00% 3.655ms 1.218ms 3
4373
+ aten::_scaled_dot_product_flash_attention 0.30% 18.620us 2.55% 156.823us 52.274us 0.000us 0.00% 3.655ms 1.218ms 3
4374
+ aten::_flash_attention_forward 0.59% 36.060us 1.88% 115.323us 38.441us 3.655ms 81.69% 3.655ms 1.218ms 3
4375
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 81.69% 3.655ms 1.218ms 3
4376
+ aten::contiguous 0.16% 9.770us 30.40% 1.867ms 155.567us 0.000us 0.00% 880.065us 73.339us 12
4377
+ aten::clone 0.46% 28.179us 30.24% 1.857ms 154.753us 0.000us 0.00% 880.065us 73.339us 12
4378
+ aten::copy_ 1.36% 83.563us 28.74% 1.765ms 147.054us 819.137us 18.31% 880.065us 73.339us 12
4379
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 819.137us 18.31% 819.137us 68.261us 12
4380
+ Activity Buffer Request 23.24% 1.427ms 23.24% 1.427ms 1.427ms 60.928us 1.36% 60.928us 60.928us 1
4381
+ aten::transpose 0.86% 52.980us 1.16% 71.060us 2.961us 0.000us 0.00% 0.000us 0.000us 24
4382
+ aten::as_strided 0.29% 18.080us 0.29% 18.080us 0.753us 0.000us 0.00% 0.000us 0.000us 24
4383
+ aten::empty_like 0.34% 20.930us 1.37% 83.913us 5.594us 0.000us 0.00% 0.000us 0.000us 15
4384
+ aten::empty 1.25% 77.043us 1.25% 77.043us 3.210us 0.000us 0.00% 0.000us 0.000us 24
4385
+ cudaLaunchKernel 4.54% 278.990us 4.54% 278.990us 18.599us 0.000us 0.00% 0.000us 0.000us 15
4386
+ aten::empty_strided 0.24% 14.661us 0.24% 14.661us 4.887us 0.000us 0.00% 0.000us 0.000us 3
4387
+ cudaDeviceGetAttribute 0.03% 1.978us 0.03% 1.978us 0.330us 0.000us 0.00% 0.000us 0.000us 6
4388
+ cudaFuncSetAttribute 0.06% 3.901us 0.06% 3.901us 1.300us 0.000us 0.00% 0.000us 0.000us 3
4389
+ cudaDeviceSynchronize 61.98% 3.806ms 61.98% 3.806ms 3.806ms 0.000us 0.00% 0.000us 0.000us 1
4390
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4391
+ Self CPU time total: 6.141ms
4392
+ Self CUDA time total: 4.474ms
4393
 
4394
 
4395
  impl wl p50(ms) ok
4396
  torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
4397
+ torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
4398
+ torch_flash_ma cuda_attn_L320_bfloat16 1.30 True
4399
+ torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
4400
+ torch_flash_ma cuda_attn_L448_bfloat16 1.50 True
4401
+ torch_flash_ma cuda_attn_L512_bfloat16 1.51 True
4402
  </pre></div>
 
4403
  <div class="cell-artifacts">
4404
  <h4>Artifacts:</h4>
4405
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/hf_kernels_flash_attn.html CHANGED
@@ -4104,14 +4104,14 @@ body[data-tool="eraser"] .main-content {
4104
  <span class="collapse-indicators">
4105
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
4106
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4107
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
4108
  </span> |
4109
- Cell: benchmark | 10.91s
4110
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4112
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4113
- <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/hf_kernels_flash_attn.md" target="_blank" class="github-btn">GitHub</a>
4114
- <a href="https://huggingface.co/kernels-community/flash-attn2" target="_blank" class="hf-btn">🤗 HF</a>
4115
  </div>
4116
  <div id="code-benchmark" class="cell-code" data-lines="32">
4117
  <div class="code-wrap">
@@ -4161,21 +4161,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
4161
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4162
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4163
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4164
- hf_kernels_flash_attn 3.74% 162.312us 41.68% 1.808ms 1.808ms 0.000us 0.00% 3.686ms 3.686ms 1
4165
- _flash_attn_9e27194::fwd 1.67% 72.360us 37.94% 1.646ms 548.560us 2.753ms 100.00% 3.686ms 1.229ms 3
4166
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.754ms 100.05% 2.754ms 2.754ms 1
4167
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.753ms 100.00% 2.753ms 917.639us 3
4168
- Activity Buffer Request 33.08% 1.435ms 33.08% 1.435ms 1.435ms 933.501us 33.91% 933.501us 933.501us 1
4169
- cudaDeviceGetAttribute 0.12% 5.209us 0.12% 5.209us 0.347us 0.000us 0.00% 0.000us 0.000us 15
4170
- aten::empty_like 0.42% 18.210us 1.24% 53.790us 17.930us 0.000us 0.00% 0.000us 0.000us 3
4171
- aten::empty_strided 0.82% 35.580us 0.82% 35.580us 11.860us 0.000us 0.00% 0.000us 0.000us 3
4172
- aten::empty 0.58% 25.153us 0.58% 25.153us 2.795us 0.000us 0.00% 0.000us 0.000us 9
4173
- cudaFuncSetAttribute 0.26% 11.441us 0.26% 11.441us 3.814us 0.000us 0.00% 0.000us 0.000us 3
4174
- cudaLaunchKernel 0.99% 42.781us 0.99% 42.781us 14.260us 0.000us 0.00% 0.000us 0.000us 3
4175
- cudaDeviceSynchronize 58.32% 2.530ms 58.32% 2.530ms 2.530ms 0.000us 0.00% 0.000us 0.000us 1
4176
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4177
- Self CPU time total: 4.338ms
4178
- Self CUDA time total: 2.753ms
4179
 
4180
 
4181
 
@@ -4185,21 +4185,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
4185
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4186
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4187
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4188
- hf_kernels_flash_attn 2.52% 113.464us 37.14% 1.670ms 1.670ms 0.000us 0.00% 3.984ms 3.984ms 1
4189
- _flash_attn_9e27194::fwd 1.10% 49.632us 34.61% 1.557ms 518.855us 2.977ms 100.00% 3.984ms 1.328ms 3
4190
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.979ms 100.05% 2.979ms 2.979ms 1
4191
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.977ms 100.00% 2.977ms 992.348us 3
4192
- Activity Buffer Request 31.69% 1.425ms 31.69% 1.425ms 1.425ms 1.007ms 33.82% 1.007ms 1.007ms 1
4193
- cudaDeviceGetAttribute 0.08% 3.769us 0.08% 3.769us 0.251us 0.000us 0.00% 0.000us 0.000us 15
4194
- aten::empty_like 0.17% 7.560us 0.54% 24.080us 8.027us 0.000us 0.00% 0.000us 0.000us 3
4195
- aten::empty_strided 0.37% 16.520us 0.37% 16.520us 5.507us 0.000us 0.00% 0.000us 0.000us 3
4196
- aten::empty 0.47% 21.170us 0.47% 21.170us 2.352us 0.000us 0.00% 0.000us 0.000us 9
4197
- cudaFuncSetAttribute 0.08% 3.820us 0.08% 3.820us 1.273us 0.000us 0.00% 0.000us 0.000us 3
4198
- cudaLaunchKernel 0.64% 28.910us 0.64% 28.910us 9.637us 0.000us 0.00% 0.000us 0.000us 3
4199
- cudaDeviceSynchronize 62.86% 2.827ms 62.86% 2.827ms 2.827ms 0.000us 0.00% 0.000us 0.000us 1
4200
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4201
- Self CPU time total: 4.497ms
4202
- Self CUDA time total: 2.977ms
4203
 
4204
 
4205
 
@@ -4209,21 +4209,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
4209
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4210
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4211
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4212
- hf_kernels_flash_attn 2.39% 108.133us 36.58% 1.655ms 1.655ms 0.000us 0.00% 4.040ms 4.040ms 1
4213
- _flash_attn_9e27194::fwd 1.06% 48.029us 34.19% 1.547ms 515.608us 3.016ms 100.00% 4.040ms 1.347ms 3
4214
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.017ms 100.05% 3.017ms 3.017ms 1
4215
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.016ms 100.00% 3.016ms 1.005ms 3
4216
- Activity Buffer Request 31.28% 1.415ms 31.28% 1.415ms 1.415ms 1.024ms 33.96% 1.024ms 1.024ms 1
4217
- cudaDeviceGetAttribute 0.09% 4.281us 0.09% 4.281us 0.285us 0.000us 0.00% 0.000us 0.000us 15
4218
- aten::empty_like 0.16% 7.121us 0.52% 23.411us 7.804us 0.000us 0.00% 0.000us 0.000us 3
4219
- aten::empty_strided 0.36% 16.290us 0.36% 16.290us 5.430us 0.000us 0.00% 0.000us 0.000us 3
4220
- aten::empty 0.49% 22.080us 0.49% 22.080us 2.453us 0.000us 0.00% 0.000us 0.000us 9
4221
- cudaFuncSetAttribute 0.08% 3.840us 0.08% 3.840us 1.280us 0.000us 0.00% 0.000us 0.000us 3
4222
- cudaLaunchKernel 0.66% 29.710us 0.66% 29.710us 9.903us 0.000us 0.00% 0.000us 0.000us 3
4223
- cudaDeviceSynchronize 63.42% 2.870ms 63.42% 2.870ms 2.870ms 0.000us 0.00% 0.000us 0.000us 1
4224
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4225
- Self CPU time total: 4.525ms
4226
- Self CUDA time total: 3.016ms
4227
 
4228
 
4229
 
@@ -4233,21 +4233,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
4233
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4234
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4235
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4236
- hf_kernels_flash_attn 2.32% 109.992us 39.04% 1.848ms 1.848ms 0.000us 0.00% 4.060ms 4.060ms 1
4237
- _flash_attn_9e27194::fwd 1.05% 49.564us 36.71% 1.738ms 579.317us 3.035ms 100.00% 4.060ms 1.353ms 3
4238
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.037ms 100.05% 3.037ms 3.037ms 1
4239
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.035ms 100.00% 3.035ms 1.012ms 3
4240
- Activity Buffer Request 29.72% 1.407ms 29.72% 1.407ms 1.407ms 1.025ms 33.76% 1.025ms 1.025ms 1
4241
- cudaDeviceGetAttribute 0.08% 3.690us 0.08% 3.690us 0.246us 0.000us 0.00% 0.000us 0.000us 15
4242
- aten::empty_like 0.16% 7.770us 0.54% 25.380us 8.460us 0.000us 0.00% 0.000us 0.000us 3
4243
- aten::empty_strided 0.37% 17.610us 0.37% 17.610us 5.870us 0.000us 0.00% 0.000us 0.000us 3
4244
- aten::empty 0.47% 22.139us 0.47% 22.139us 2.460us 0.000us 0.00% 0.000us 0.000us 9
4245
- cudaFuncSetAttribute 0.08% 3.790us 0.08% 3.790us 1.263us 0.000us 0.00% 0.000us 0.000us 3
4246
- cudaLaunchKernel 4.78% 226.343us 4.78% 226.343us 75.448us 0.000us 0.00% 0.000us 0.000us 3
4247
- cudaDeviceSynchronize 60.96% 2.886ms 60.96% 2.886ms 2.886ms 0.000us 0.00% 0.000us 0.000us 1
4248
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4249
- Self CPU time total: 4.734ms
4250
- Self CUDA time total: 3.035ms
4251
 
4252
 
4253
 
@@ -4257,21 +4257,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
4257
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4258
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4259
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4260
- hf_kernels_flash_attn 2.11% 110.542us 35.45% 1.860ms 1.860ms 0.000us 0.00% 4.719ms 4.719ms 1
4261
- _flash_attn_9e27194::fwd 0.97% 51.080us 33.34% 1.750ms 583.220us 3.535ms 100.00% 4.719ms 1.573ms 3
4262
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.537ms 100.04% 3.537ms 3.537ms 1
4263
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.535ms 100.00% 3.535ms 1.178ms 3
4264
- Activity Buffer Request 27.95% 1.467ms 27.95% 1.467ms 1.467ms 1.184ms 33.49% 1.184ms 1.184ms 1
4265
- cudaDeviceGetAttribute 0.07% 3.640us 0.07% 3.640us 0.243us 0.000us 0.00% 0.000us 0.000us 15
4266
- aten::empty_like 0.14% 7.520us 0.47% 24.731us 8.244us 0.000us 0.00% 0.000us 0.000us 3
4267
- aten::empty_strided 0.33% 17.211us 0.33% 17.211us 5.737us 0.000us 0.00% 0.000us 0.000us 3
4268
- aten::empty 0.43% 22.670us 0.43% 22.670us 2.519us 0.000us 0.00% 0.000us 0.000us 9
4269
- cudaFuncSetAttribute 0.07% 3.800us 0.07% 3.800us 1.267us 0.000us 0.00% 0.000us 0.000us 3
4270
- cudaLaunchKernel 3.37% 176.824us 3.37% 176.824us 58.941us 0.000us 0.00% 0.000us 0.000us 3
4271
- cudaDeviceSynchronize 64.55% 3.388ms 64.55% 3.388ms 3.388ms 0.000us 0.00% 0.000us 0.000us 1
4272
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4273
- Self CPU time total: 5.248ms
4274
- Self CUDA time total: 3.535ms
4275
 
4276
 
4277
 
@@ -4281,41 +4281,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
4281
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4282
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4283
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4284
- hf_kernels_flash_attn 2.24% 118.861us 34.58% 1.832ms 1.832ms 0.000us 0.00% 4.834ms 4.834ms 1
4285
- _flash_attn_9e27194::fwd 0.90% 47.900us 32.34% 1.713ms 571.163us 3.618ms 100.00% 4.834ms 1.611ms 3
4286
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.619ms 100.04% 3.619ms 3.619ms 1
4287
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.618ms 100.00% 3.618ms 1.206ms 3
4288
- Activity Buffer Request 27.32% 1.448ms 27.32% 1.448ms 1.448ms 1.217ms 33.63% 1.217ms 1.217ms 1
4289
- cudaDeviceGetAttribute 0.07% 3.661us 0.07% 3.661us 0.244us 0.000us 0.00% 0.000us 0.000us 15
4290
- aten::empty_like 0.14% 7.320us 0.50% 26.231us 8.744us 0.000us 0.00% 0.000us 0.000us 3
4291
- aten::empty_strided 0.36% 18.911us 0.36% 18.911us 6.304us 0.000us 0.00% 0.000us 0.000us 3
4292
- aten::empty 0.40% 21.351us 0.40% 21.351us 2.372us 0.000us 0.00% 0.000us 0.000us 9
4293
- cudaFuncSetAttribute 0.08% 4.160us 0.08% 4.160us 1.387us 0.000us 0.00% 0.000us 0.000us 3
4294
- cudaLaunchKernel 3.07% 162.463us 3.07% 162.463us 54.154us 0.000us 0.00% 0.000us 0.000us 3
4295
- cudaDeviceSynchronize 65.42% 3.466ms 65.42% 3.466ms 3.466ms 0.000us 0.00% 0.000us 0.000us 1
4296
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4297
- Self CPU time total: 5.299ms
4298
- Self CUDA time total: 3.618ms
4299
 
4300
 
4301
  impl wl p50(ms) ok
4302
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.94 True
4303
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 0.99 True
4304
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.03 True
4305
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.05 True
4306
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
4307
  hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
4308
  </pre></div>
4309
- <div class="uv-install-logs" id="uv-logs-benchmark">
4310
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4311
- <div class="uv-logs-content" style="display: none;">
4312
- Installed 15 packages in 15ms
4313
  </div>
4314
- </div>
4315
- <div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
4316
- Fetching 20 files: 5%|▌ | 1/20 [00:00&lt;00:02, 8.29it/s]
4317
- Fetching 20 files: 10%|█ | 2/20 [00:06&lt;01:08, 3.82s/it]
4318
- Fetching 20 files: 100%|██████████| 20/20 [00:06&lt;00:00, 3.06it/s]</div>
4319
  <div class="cell-artifacts">
4320
  <h4>Artifacts:</h4>
4321
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
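Unlike the torch_flash_ma traces, the hf_kernels_flash_attn traces below show a single hub-loaded op (_flash_attn_9e27194::fwd) with no aten::contiguous / aten::clone rows: the kernel consumes the benchmark layout directly, so no layout copies are needed. A hedged sketch of loading and calling it with the kernels library; the entry-point name and input layout are assumed to mirror the flash-attn package:

    import torch
    from kernels import get_kernel

    # Downloads the pre-built binary from the Hub
    # (the "Fetching 20 files" progress in stderr below).
    flash = get_kernel("kernels-community/flash-attn")

    q = torch.randn(1, 512, 8, 64, device="cuda", dtype=torch.bfloat16)
    k, v = torch.randn_like(q), torch.randn_like(q)

    # Inputs stay in (batch, seq, heads, dim); entry-point name assumed.
    out = flash.flash_attn_func(q, k, v)
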
4104
  <span class="collapse-indicators">
4105
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
4106
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4107
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
4108
  </span> |
4109
+ Cell: benchmark | 5.83s
4110
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4112
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4113
+ <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/hf_kernels_flash_attn.md" target="_blank" class="github-btn">GitHub</a>
4114
+ <a href="https://huggingface.co/kernels-community/flash-attn" target="_blank" class="hf-btn">🤗 HF</a>
4115
  </div>
4116
  <div id="code-benchmark" class="cell-code" data-lines="32">
4117
  <div class="code-wrap">
 
4161
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4162
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4163
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4164
+ hf_kernels_flash_attn 3.51% 153.413us 41.11% 1.797ms 1.797ms 0.000us 0.00% 3.733ms 3.733ms 1
4165
+ _flash_attn_9e27194::fwd 1.62% 70.702us 37.60% 1.644ms 547.894us 2.785ms 100.00% 3.733ms 1.244ms 3
4166
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.786ms 100.05% 2.786ms 2.786ms 1
4167
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.785ms 100.00% 2.785ms 928.303us 3
4168
+ Activity Buffer Request 32.92% 1.439ms 32.92% 1.439ms 1.439ms 947.706us 34.03% 947.706us 947.706us 1
4169
+ cudaDeviceGetAttribute 0.11% 4.891us 0.11% 4.891us 0.326us 0.000us 0.00% 0.000us 0.000us 15
4170
+ aten::empty_like 0.37% 16.181us 1.17% 51.061us 17.020us 0.000us 0.00% 0.000us 0.000us 3
4171
+ aten::empty_strided 0.80% 34.880us 0.80% 34.880us 11.627us 0.000us 0.00% 0.000us 0.000us 3
4172
+ aten::empty 0.59% 25.681us 0.59% 25.681us 2.853us 0.000us 0.00% 0.000us 0.000us 9
4173
+ cudaFuncSetAttribute 0.26% 11.340us 0.26% 11.340us 3.780us 0.000us 0.00% 0.000us 0.000us 3
4174
+ cudaLaunchKernel 0.93% 40.731us 0.93% 40.731us 13.577us 0.000us 0.00% 0.000us 0.000us 3
4175
+ cudaDeviceSynchronize 58.89% 2.575ms 58.89% 2.575ms 2.575ms 0.000us 0.00% 0.000us 0.000us 1
4176
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4177
+ Self CPU time total: 4.372ms
4178
+ Self CUDA time total: 2.785ms
4179
 
4180
 
4181
 
 
4185
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4186
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4187
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4188
+ hf_kernels_flash_attn 1.94% 86.682us 37.50% 1.676ms 1.676ms 0.000us 0.00% 3.929ms 3.929ms 1
4189
+ _flash_attn_9e27194::fwd 1.06% 47.570us 35.56% 1.589ms 529.734us 2.938ms 100.00% 3.929ms 1.310ms 3
4190
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.939ms 100.05% 2.939ms 2.939ms 1
4191
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.938ms 100.00% 2.938ms 979.209us 3
4192
+ Activity Buffer Request 32.66% 1.460ms 32.66% 1.460ms 1.460ms 991.166us 33.74% 991.166us 991.166us 1
4193
+ cudaDeviceGetAttribute 0.10% 4.450us 0.10% 4.450us 0.297us 0.000us 0.00% 0.000us 0.000us 15
4194
+ aten::empty_like 0.19% 8.440us 0.55% 24.690us 8.230us 0.000us 0.00% 0.000us 0.000us 3
4195
+ aten::empty_strided 0.36% 16.250us 0.36% 16.250us 5.417us 0.000us 0.00% 0.000us 0.000us 3
4196
+ aten::empty 0.51% 22.872us 0.51% 22.872us 2.541us 0.000us 0.00% 0.000us 0.000us 9
4197
+ cudaFuncSetAttribute 0.07% 3.350us 0.07% 3.350us 1.117us 0.000us 0.00% 0.000us 0.000us 3
4198
+ cudaLaunchKernel 0.60% 26.611us 0.60% 26.611us 8.870us 0.000us 0.00% 0.000us 0.000us 3
4199
+ cudaDeviceSynchronize 62.50% 2.794ms 62.50% 2.794ms 2.794ms 0.000us 0.00% 0.000us 0.000us 1
4200
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4201
+ Self CPU time total: 4.469ms
4202
+ Self CUDA time total: 2.938ms
4203
 
4204
 
4205
 
 
4209
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4210
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4211
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4212
+ hf_kernels_flash_attn 2.38% 109.313us 36.70% 1.683ms 1.683ms 0.000us 0.00% 4.081ms 4.081ms 1
4213
+ _flash_attn_9e27194::fwd 1.05% 48.167us 34.31% 1.574ms 524.567us 3.048ms 100.00% 4.081ms 1.360ms 3
4214
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.049ms 100.05% 3.049ms 3.049ms 1
4215
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.048ms 100.00% 3.048ms 1.016ms 3
4216
+ Activity Buffer Request 31.46% 1.443ms 31.46% 1.443ms 1.443ms 1.033ms 33.90% 1.033ms 1.033ms 1
4217
+ cudaDeviceGetAttribute 0.09% 4.231us 0.09% 4.231us 0.282us 0.000us 0.00% 0.000us 0.000us 15
4218
+ aten::empty_like 0.16% 7.250us 0.52% 23.960us 7.987us 0.000us 0.00% 0.000us 0.000us 3
4219
+ aten::empty_strided 0.36% 16.710us 0.36% 16.710us 5.570us 0.000us 0.00% 0.000us 0.000us 3
4220
+ aten::empty 0.46% 21.300us 0.46% 21.300us 2.367us 0.000us 0.00% 0.000us 0.000us 9
4221
+ cudaFuncSetAttribute 0.08% 3.561us 0.08% 3.561us 1.187us 0.000us 0.00% 0.000us 0.000us 3
4222
+ cudaLaunchKernel 0.64% 29.473us 0.64% 29.473us 9.824us 0.000us 0.00% 0.000us 0.000us 3
4223
+ cudaDeviceSynchronize 63.30% 2.903ms 63.30% 2.903ms 2.903ms 0.000us 0.00% 0.000us 0.000us 1
4224
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4225
+ Self CPU time total: 4.586ms
4226
+ Self CUDA time total: 3.048ms
4227
 
4228
 
4229
 
 
4233
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4234
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4235
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4236
+ hf_kernels_flash_attn 2.13% 103.094us 38.83% 1.884ms 1.884ms 0.000us 0.00% 4.165ms 4.165ms 1
4237
+ _flash_attn_9e27194::fwd 0.99% 47.838us 36.71% 1.781ms 593.521us 3.114ms 100.00% 4.165ms 1.388ms 3
4238
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.116ms 100.05% 3.116ms 3.116ms 1
4239
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.114ms 100.00% 3.114ms 1.038ms 3
4240
+ Activity Buffer Request 29.59% 1.435ms 29.59% 1.435ms 1.435ms 1.051ms 33.75% 1.051ms 1.051ms 1
4241
+ cudaDeviceGetAttribute 0.08% 3.800us 0.08% 3.800us 0.253us 0.000us 0.00% 0.000us 0.000us 15
4242
+ aten::empty_like 0.16% 7.891us 0.53% 25.811us 8.604us 0.000us 0.00% 0.000us 0.000us 3
4243
+ aten::empty_strided 0.37% 17.920us 0.37% 17.920us 5.973us 0.000us 0.00% 0.000us 0.000us 3
4244
+ aten::empty 0.45% 21.731us 0.45% 21.731us 2.415us 0.000us 0.00% 0.000us 0.000us 9
4245
+ cudaFuncSetAttribute 0.08% 3.740us 0.08% 3.740us 1.247us 0.000us 0.00% 0.000us 0.000us 3
4246
+ cudaLaunchKernel 4.99% 242.187us 4.99% 242.187us 80.729us 0.000us 0.00% 0.000us 0.000us 3
4247
+ cudaDeviceSynchronize 61.17% 2.967ms 61.17% 2.967ms 2.967ms 0.000us 0.00% 0.000us 0.000us 1
4248
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4249
+ Self CPU time total: 4.851ms
4250
+ Self CUDA time total: 3.114ms
4251
 
4252
 
4253
 
 
4257
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4258
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4259
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4260
+ hf_kernels_flash_attn 2.00% 105.522us 34.61% 1.828ms 1.828ms 0.000us 0.00% 4.806ms 4.806ms 1
4261
+ _flash_attn_9e27194::fwd 0.94% 49.622us 32.62% 1.723ms 574.192us 3.597ms 100.00% 4.806ms 1.602ms 3
4262
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.599ms 100.05% 3.599ms 3.599ms 1
4263
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.597ms 100.00% 3.597ms 1.199ms 3
4264
+ Activity Buffer Request 27.37% 1.446ms 27.37% 1.446ms 1.446ms 1.209ms 33.59% 1.209ms 1.209ms 1
4265
+ cudaDeviceGetAttribute 0.08% 3.991us 0.08% 3.991us 0.266us 0.000us 0.00% 0.000us 0.000us 15
4266
+ aten::empty_like 0.14% 7.250us 0.47% 24.620us 8.207us 0.000us 0.00% 0.000us 0.000us 3
4267
+ aten::empty_strided 0.33% 17.370us 0.33% 17.370us 5.790us 0.000us 0.00% 0.000us 0.000us 3
4268
+ aten::empty 0.41% 21.681us 0.41% 21.681us 2.409us 0.000us 0.00% 0.000us 0.000us 9
4269
+ cudaFuncSetAttribute 0.07% 3.770us 0.07% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3
4270
+ cudaLaunchKernel 3.28% 173.384us 3.28% 173.384us 57.795us 0.000us 0.00% 0.000us 0.000us 3
4271
+ cudaDeviceSynchronize 65.39% 3.453ms 65.39% 3.453ms 3.453ms 0.000us 0.00% 0.000us 0.000us 1
4272
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4273
+ Self CPU time total: 5.281ms
4274
+ Self CUDA time total: 3.597ms
4275
 
4276
 
4277
 
 
4281
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4282
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4283
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4284
+ hf_kernels_flash_attn 2.02% 107.892us 33.82% 1.810ms 1.810ms 0.000us 0.00% 4.930ms 4.930ms 1
4285
+ _flash_attn_9e27194::fwd 0.91% 48.918us 31.80% 1.702ms 567.268us 3.687ms 100.00% 4.930ms 1.643ms 3
4286
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.689ms 100.04% 3.689ms 3.689ms 1
4287
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.687ms 100.00% 3.687ms 1.229ms 3
4288
+ Activity Buffer Request 26.86% 1.437ms 26.86% 1.437ms 1.437ms 1.242ms 33.69% 1.242ms 1.242ms 1
4289
+ cudaDeviceGetAttribute 0.07% 3.881us 0.07% 3.881us 0.259us 0.000us 0.00% 0.000us 0.000us 15
4290
+ aten::empty_like 0.14% 7.591us 0.49% 26.111us 8.704us 0.000us 0.00% 0.000us 0.000us 3
4291
+ aten::empty_strided 0.35% 18.520us 0.35% 18.520us 6.173us 0.000us 0.00% 0.000us 0.000us 3
4292
+ aten::empty 0.39% 20.640us 0.39% 20.640us 2.293us 0.000us 0.00% 0.000us 0.000us 9
4293
+ cudaFuncSetAttribute 0.07% 3.561us 0.07% 3.561us 1.187us 0.000us 0.00% 0.000us 0.000us 3
4294
+ cudaLaunchKernel 3.01% 161.306us 3.01% 161.306us 53.769us 0.000us 0.00% 0.000us 0.000us 3
4295
+ cudaDeviceSynchronize 66.18% 3.542ms 66.18% 3.542ms 3.542ms 0.000us 0.00% 0.000us 0.000us 1
4296
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4297
+ Self CPU time total: 5.351ms
4298
+ Self CUDA time total: 3.687ms
4299
 
4300
 
4301
  impl wl p50(ms) ok
4302
+ hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
4303
+ hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
4304
+ hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
4305
+ hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
4306
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.23 True
4307
  hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
4308
  </pre></div>
4309
+ <div class="cell-stderr">
4310
+ Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
4311
+ Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:15, 1.19it/s]
4312
+ Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 11.87it/s]
4313
  </div>
 
 
 
 
 
4314
  <div class="cell-artifacts">
4315
  <h4>Artifacts:</h4>
4316
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
4106
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4107
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
4108
  </span> |
4109
- Cell: benchmark | 5.55s
4110
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4112
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4113
- <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/hf_kernels_flash_attn3.md" target="_blank" class="github-btn">GitHub</a>
4114
  <a href="https://huggingface.co/kernels-community/flash-attn3" target="_blank" class="hf-btn">🤗 HF</a>
4115
  </div>
4116
  <div id="code-benchmark" class="cell-code" data-lines="31">
@@ -4160,19 +4160,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
4160
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4161
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4162
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4163
- hf_kernels_flash_attn3 4.02% 170.054us 45.66% 1.931ms 1.931ms 0.000us 0.00% 3.489ms 3.489ms 1
4164
- FlashAttnFunc 2.98% 126.112us 41.64% 1.761ms 586.890us 0.000us 0.00% 3.489ms 1.163ms 3
4165
- _flash_attn3_48fe103_dirty::fwd 1.85% 78.440us 38.65% 1.635ms 544.853us 2.605ms 100.00% 3.489ms 1.163ms 3
4166
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.606ms 100.06% 2.606ms 2.606ms 1
4167
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.605ms 100.00% 2.605ms 868.221us 3
4168
- Activity Buffer Request 34.45% 1.457ms 34.45% 1.457ms 1.457ms 884.680us 33.97% 884.680us 884.680us 1
4169
- aten::empty 1.07% 45.402us 1.07% 45.402us 7.567us 0.000us 0.00% 0.000us 0.000us 6
4170
- cudaFuncSetAttribute 0.29% 12.202us 0.29% 12.202us 4.067us 0.000us 0.00% 0.000us 0.000us 3
4171
- cudaLaunchKernel 0.99% 41.761us 0.99% 41.761us 13.920us 0.000us 0.00% 0.000us 0.000us 3
4172
- cudaDeviceSynchronize 54.34% 2.298ms 54.34% 2.298ms 2.298ms 0.000us 0.00% 0.000us 0.000us 1
4173
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4174
- Self CPU time total: 4.229ms
4175
- Self CUDA time total: 2.605ms
4176
 
4177
 
4178
 
@@ -4182,19 +4182,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
4182
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4183
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4184
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4185
- hf_kernels_flash_attn3 2.90% 125.133us 41.34% 1.782ms 1.782ms 0.000us 0.00% 3.684ms 3.684ms 1
4186
- FlashAttnFunc 2.10% 90.312us 38.43% 1.657ms 552.206us 0.000us 0.00% 3.684ms 1.228ms 3
4187
- _flash_attn3_48fe103_dirty::fwd 1.24% 53.461us 36.34% 1.566ms 522.102us 2.755ms 100.00% 3.684ms 1.228ms 3
4188
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.756ms 100.06% 2.756ms 2.756ms 1
4189
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.755ms 100.00% 2.755ms 918.309us 3
4190
- Activity Buffer Request 33.60% 1.448ms 33.60% 1.448ms 1.448ms 929.157us 33.73% 929.157us 929.157us 1
4191
- aten::empty 0.64% 27.380us 0.64% 27.380us 4.563us 0.000us 0.00% 0.000us 0.000us 6
4192
- cudaFuncSetAttribute 0.13% 5.449us 0.13% 5.449us 1.816us 0.000us 0.00% 0.000us 0.000us 3
4193
- cudaLaunchKernel 0.74% 31.802us 0.74% 31.802us 10.601us 0.000us 0.00% 0.000us 0.000us 3
4194
- cudaDeviceSynchronize 58.66% 2.529ms 58.66% 2.529ms 2.529ms 0.000us 0.00% 0.000us 0.000us 1
4195
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4196
- Self CPU time total: 4.310ms
4197
- Self CUDA time total: 2.755ms
4198
 
4199
 
4200
 
@@ -4204,19 +4204,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
4204
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4205
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4206
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4207
- hf_kernels_flash_attn3 2.81% 125.615us 39.44% 1.762ms 1.762ms 0.000us 0.00% 3.917ms 3.917ms 1
4208
- FlashAttnFunc 2.03% 90.880us 36.63% 1.637ms 545.546us 0.000us 0.00% 3.917ms 1.306ms 3
4209
- _flash_attn3_48fe103_dirty::fwd 1.20% 53.572us 34.59% 1.546ms 515.252us 2.927ms 100.00% 3.917ms 1.306ms 3
4210
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.928ms 100.05% 2.928ms 2.928ms 1
4211
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.927ms 100.00% 2.927ms 975.593us 3
4212
- Activity Buffer Request 31.96% 1.428ms 31.96% 1.428ms 1.428ms 990.441us 33.84% 990.441us 990.441us 1
4213
- aten::empty 0.63% 27.950us 0.63% 27.950us 4.658us 0.000us 0.00% 0.000us 0.000us 6
4214
- cudaFuncSetAttribute 0.12% 5.340us 0.12% 5.340us 1.780us 0.000us 0.00% 0.000us 0.000us 3
4215
- cudaLaunchKernel 0.68% 30.562us 0.68% 30.562us 10.187us 0.000us 0.00% 0.000us 0.000us 3
4216
- cudaDeviceSynchronize 60.56% 2.706ms 60.56% 2.706ms 2.706ms 0.000us 0.00% 0.000us 0.000us 1
4217
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4218
- Self CPU time total: 4.469ms
4219
- Self CUDA time total: 2.927ms
4220
 
4221
 
4222
 
@@ -4226,19 +4226,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
4226
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4227
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4228
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4229
- hf_kernels_flash_attn3 2.73% 126.513us 42.04% 1.948ms 1.948ms 0.000us 0.00% 3.892ms 3.892ms 1
4230
- FlashAttnFunc 2.03% 94.184us 39.31% 1.821ms 607.134us 0.000us 0.00% 3.892ms 1.297ms 3
4231
- _flash_attn3_48fe103_dirty::fwd 1.14% 52.959us 37.28% 1.727ms 575.740us 2.906ms 100.00% 3.892ms 1.297ms 3
4232
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.908ms 100.05% 2.908ms 2.908ms 1
4233
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.906ms 100.00% 2.906ms 968.728us 3
4234
- Activity Buffer Request 30.69% 1.422ms 30.69% 1.422ms 1.422ms 985.540us 33.91% 985.540us 985.540us 1
4235
- aten::empty 0.63% 29.361us 0.63% 29.361us 4.893us 0.000us 0.00% 0.000us 0.000us 6
4236
- cudaFuncSetAttribute 0.11% 5.241us 0.11% 5.241us 1.747us 0.000us 0.00% 0.000us 0.000us 3
4237
- cudaLaunchKernel 4.70% 217.965us 4.70% 217.965us 72.655us 0.000us 0.00% 0.000us 0.000us 3
4238
- cudaDeviceSynchronize 57.96% 2.685ms 57.96% 2.685ms 2.685ms 0.000us 0.00% 0.000us 0.000us 1
4239
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4240
- Self CPU time total: 4.633ms
4241
- Self CUDA time total: 2.906ms
4242
 
4243
 
4244
 
@@ -4248,19 +4248,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
4248
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4249
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4250
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4251
- hf_kernels_flash_attn3 2.33% 120.764us 37.09% 1.922ms 1.922ms 0.000us 0.00% 4.645ms 4.645ms 1
4252
- FlashAttnFunc 1.78% 92.240us 34.76% 1.801ms 600.384us 0.000us 0.00% 4.645ms 1.548ms 3
4253
- _flash_attn3_48fe103_dirty::fwd 1.04% 53.829us 32.98% 1.709ms 569.637us 3.482ms 100.00% 4.645ms 1.548ms 3
4254
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.483ms 100.04% 3.483ms 3.483ms 1
4255
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.482ms 100.00% 3.482ms 1.161ms 3
4256
- Activity Buffer Request 27.80% 1.441ms 27.80% 1.441ms 1.441ms 1.163ms 33.40% 1.163ms 1.163ms 1
4257
- aten::empty 0.54% 28.012us 0.54% 28.012us 4.669us 0.000us 0.00% 0.000us 0.000us 6
4258
- cudaFuncSetAttribute 0.10% 5.211us 0.10% 5.211us 1.737us 0.000us 0.00% 0.000us 0.000us 3
4259
- cudaLaunchKernel 3.50% 181.305us 3.50% 181.305us 60.435us 0.000us 0.00% 0.000us 0.000us 3
4260
- cudaDeviceSynchronize 62.91% 3.260ms 62.91% 3.260ms 3.260ms 0.000us 0.00% 0.000us 0.000us 1
4261
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4262
- Self CPU time total: 5.182ms
4263
- Self CUDA time total: 3.482ms
4264
 
4265
 
4266
 
@@ -4270,33 +4270,33 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
4270
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4271
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4272
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4273
- hf_kernels_flash_attn3 2.54% 130.883us 37.28% 1.924ms 1.924ms 0.000us 0.00% 4.633ms 4.633ms 1
4274
- FlashAttnFunc 1.80% 93.033us 34.74% 1.793ms 597.564us 0.000us 0.00% 4.633ms 1.544ms 3
4275
- _flash_attn3_48fe103_dirty::fwd 1.02% 52.583us 32.94% 1.700ms 566.553us 3.468ms 100.00% 4.633ms 1.544ms 3
4276
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.469ms 100.04% 3.469ms 3.469ms 1
4277
  void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.468ms 100.00% 3.468ms 1.156ms 3
4278
- Activity Buffer Request 27.99% 1.444ms 27.99% 1.444ms 1.444ms 1.165ms 33.61% 1.165ms 1.165ms 1
4279
- aten::empty 0.56% 29.150us 0.56% 29.150us 4.858us 0.000us 0.00% 0.000us 0.000us 6
4280
- cudaFuncSetAttribute 0.10% 5.050us 0.10% 5.050us 1.683us 0.000us 0.00% 0.000us 0.000us 3
4281
- cudaLaunchKernel 3.27% 168.763us 3.27% 168.763us 56.254us 0.000us 0.00% 0.000us 0.000us 3
4282
- cudaDeviceSynchronize 62.72% 3.236ms 62.72% 3.236ms 3.236ms 0.000us 0.00% 0.000us 0.000us 1
4283
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4284
- Self CPU time total: 5.160ms
4285
  Self CUDA time total: 3.468ms
4286
 
4287
 
4288
  impl wl p50(ms) ok
4289
- hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.91 True
4290
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.95 True
4291
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.02 True
4292
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
4293
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
4294
  hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4295
  </pre></div>
4296
  <div class="cell-stderr">
4297
  Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4298
- Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.35it/s]
4299
- Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.71it/s]
4300
  </div>
4301
  <div class="cell-artifacts">
4302
  <h4>Artifacts:</h4>
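Note: both flash-attn variants fetch prebuilt binaries from the Hub at run time (the "Fetching N files" lines in the stderr blocks). A minimal loading sketch with the kernels library; the flash_attn_func entry point mirrors the upstream flash-attn API and is an assumption here — check the repo card for the exact signature and return type:

    import torch
    from kernels import get_kernel

    # Downloads and caches the prebuilt kernel from the Hub on first use.
    flash_attn3 = get_kernel("kernels-community/flash-attn3")

    q = torch.randn(1, 512, 16, 64, device="cuda", dtype=torch.bfloat16)  # (B, S, H, D)
    k, v = torch.randn_like(q), torch.randn_like(q)

    # Assumed entry point, following the upstream flash-attn convention.
    out = flash_attn3.flash_attn_func(q, k, v, causal=True)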
 
4106
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4107
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
4108
  </span> |
4109
+ Cell: benchmark | 5.53s
4110
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4112
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4113
+ <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/hf_kernels_flash_attn3.md" target="_blank" class="github-btn">GitHub</a>
4114
  <a href="https://huggingface.co/kernels-community/flash-attn3" target="_blank" class="hf-btn">🤗 HF</a>
4115
  </div>
4116
  <div id="code-benchmark" class="cell-code" data-lines="31">
 
4160
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4161
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4162
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4163
+ hf_kernels_flash_attn3 3.85% 171.193us 46.01% 2.045ms 2.045ms 0.000us 0.00% 3.614ms 3.614ms 1
4164
+ FlashAttnFunc 3.07% 136.295us 42.15% 1.874ms 624.570us 0.000us 0.00% 3.614ms 1.205ms 3
4165
+ _flash_attn3_48fe103_dirty::fwd 1.94% 86.341us 39.09% 1.737ms 579.138us 2.720ms 100.00% 3.614ms 1.205ms 3
4166
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.722ms 100.05% 2.722ms 2.722ms 1
4167
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.720ms 100.00% 2.720ms 906.698us 3
4168
+ Activity Buffer Request 34.72% 1.543ms 34.72% 1.543ms 1.543ms 893.600us 32.85% 893.600us 893.600us 1
4169
+ aten::empty 1.07% 47.441us 1.07% 47.441us 7.907us 0.000us 0.00% 0.000us 0.000us 6
4170
+ cudaFuncSetAttribute 0.31% 13.761us 0.31% 13.761us 4.587us 0.000us 0.00% 0.000us 0.000us 3
4171
+ cudaLaunchKernel 1.05% 46.772us 1.05% 46.772us 15.591us 0.000us 0.00% 0.000us 0.000us 3
4172
+ cudaDeviceSynchronize 53.99% 2.400ms 53.99% 2.400ms 2.400ms 0.000us 0.00% 0.000us 0.000us 1
4173
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4174
+ Self CPU time total: 4.445ms
4175
+ Self CUDA time total: 2.720ms
4176
 
4177
 
4178
 
 
4182
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4183
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4184
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4185
+ hf_kernels_flash_attn3 2.41% 104.370us 41.13% 1.784ms 1.784ms 0.000us 0.00% 3.700ms 3.700ms 1
4186
+ FlashAttnFunc 2.00% 86.685us 38.73% 1.679ms 559.738us 0.000us 0.00% 3.700ms 1.233ms 3
4187
+ _flash_attn3_48fe103_dirty::fwd 1.21% 52.631us 36.73% 1.593ms 530.843us 2.768ms 100.00% 3.700ms 1.233ms 3
4188
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.769ms 100.06% 2.769ms 2.769ms 1
4189
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.768ms 100.00% 2.768ms 922.559us 3
4190
+ Activity Buffer Request 34.10% 1.479ms 34.10% 1.479ms 1.479ms 932.127us 33.68% 932.127us 932.127us 1
4191
+ aten::empty 0.60% 25.981us 0.60% 25.981us 4.330us 0.000us 0.00% 0.000us 0.000us 6
4192
+ cudaFuncSetAttribute 0.12% 5.050us 0.12% 5.050us 1.683us 0.000us 0.00% 0.000us 0.000us 3
4193
+ cudaLaunchKernel 0.70% 30.140us 0.70% 30.140us 10.047us 0.000us 0.00% 0.000us 0.000us 3
4194
+ cudaDeviceSynchronize 58.87% 2.553ms 58.87% 2.553ms 2.553ms 0.000us 0.00% 0.000us 0.000us 1
4195
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4196
+ Self CPU time total: 4.336ms
4197
+ Self CUDA time total: 2.768ms
4198
 
4199
 
4200
 
 
4204
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4205
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4206
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4207
+ hf_kernels_flash_attn3 2.29% 102.411us 40.10% 1.791ms 1.791ms 0.000us 0.00% 3.875ms 3.875ms 1
4208
+ FlashAttnFunc 2.01% 89.903us 37.81% 1.688ms 562.801us 0.000us 0.00% 3.875ms 1.292ms 3
4209
+ _flash_attn3_48fe103_dirty::fwd 1.18% 52.613us 35.79% 1.599ms 532.834us 2.892ms 100.00% 3.875ms 1.292ms 3
4210
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.893ms 100.05% 2.893ms 2.893ms 1
4211
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.892ms 100.00% 2.892ms 963.972us 3
4212
+ Activity Buffer Request 33.24% 1.485ms 33.24% 1.485ms 1.485ms 983.097us 33.99% 983.097us 983.097us 1
4213
+ aten::empty 0.58% 25.770us 0.58% 25.770us 4.295us 0.000us 0.00% 0.000us 0.000us 6
4214
+ cudaFuncSetAttribute 0.11% 4.820us 0.11% 4.820us 1.607us 0.000us 0.00% 0.000us 0.000us 3
4215
+ cudaLaunchKernel 0.69% 30.740us 0.69% 30.740us 10.247us 0.000us 0.00% 0.000us 0.000us 3
4216
+ cudaDeviceSynchronize 59.90% 2.675ms 59.90% 2.675ms 2.675ms 0.000us 0.00% 0.000us 0.000us 1
4217
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4218
+ Self CPU time total: 4.466ms
4219
+ Self CUDA time total: 2.892ms
4220
 
4221
 
4222
 
 
4226
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4227
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4228
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4229
+ hf_kernels_flash_attn3 2.68% 125.944us 42.11% 1.982ms 1.982ms 0.000us 0.00% 3.932ms 3.932ms 1
4230
+ FlashAttnFunc 1.98% 92.983us 39.44% 1.856ms 618.639us 0.000us 0.00% 3.932ms 1.311ms 3
4231
+ _flash_attn3_48fe103_dirty::fwd 1.14% 53.661us 37.46% 1.763ms 587.645us 2.953ms 100.00% 3.932ms 1.311ms 3
4232
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.954ms 100.06% 2.954ms 2.954ms 1
4233
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.953ms 100.00% 2.953ms 984.176us 3
4234
+ Activity Buffer Request 30.48% 1.434ms 30.48% 1.434ms 1.434ms 979.803us 33.19% 979.803us 979.803us 1
4235
+ aten::empty 0.58% 27.450us 0.58% 27.450us 4.575us 0.000us 0.00% 0.000us 0.000us 6
4236
+ cudaFuncSetAttribute 0.11% 5.150us 0.11% 5.150us 1.717us 0.000us 0.00% 0.000us 0.000us 3
4237
+ cudaLaunchKernel 5.15% 242.396us 5.15% 242.396us 80.799us 0.000us 0.00% 0.000us 0.000us 3
4238
+ cudaDeviceSynchronize 57.89% 2.724ms 57.89% 2.724ms 2.724ms 0.000us 0.00% 0.000us 0.000us 1
4239
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4240
+ Self CPU time total: 4.706ms
4241
+ Self CUDA time total: 2.953ms
4242
 
4243
 
4244
 
 
4248
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4249
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4250
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4251
+ hf_kernels_flash_attn3 2.36% 122.892us 37.59% 1.960ms 1.960ms 0.000us 0.00% 4.622ms 4.622ms 1
4252
+ FlashAttnFunc 1.74% 90.533us 35.23% 1.837ms 612.429us 0.000us 0.00% 4.622ms 1.541ms 3
4253
+ _flash_attn3_48fe103_dirty::fwd 0.97% 50.750us 33.49% 1.747ms 582.252us 3.470ms 100.00% 4.622ms 1.541ms 3
4254
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.472ms 100.05% 3.472ms 3.472ms 1
4255
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.470ms 100.00% 3.470ms 1.157ms 3
4256
+ Activity Buffer Request 27.49% 1.433ms 27.49% 1.433ms 1.433ms 1.152ms 33.20% 1.152ms 1.152ms 1
4257
+ aten::empty 0.51% 26.592us 0.51% 26.592us 4.432us 0.000us 0.00% 0.000us 0.000us 6
4258
+ cudaFuncSetAttribute 0.10% 5.060us 0.10% 5.060us 1.687us 0.000us 0.00% 0.000us 0.000us 3
4259
+ cudaLaunchKernel 4.43% 230.856us 4.43% 230.856us 76.952us 0.000us 0.00% 0.000us 0.000us 3
4260
+ cudaDeviceSynchronize 62.41% 3.255ms 62.41% 3.255ms 3.255ms 0.000us 0.00% 0.000us 0.000us 1
4261
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4262
+ Self CPU time total: 5.215ms
4263
+ Self CUDA time total: 3.470ms
4264
 
4265
 
4266
 
 
4270
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4271
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4272
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4273
+ hf_kernels_flash_attn3 2.32% 120.892us 37.51% 1.951ms 1.951ms 0.000us 0.00% 4.639ms 4.639ms 1
4274
+ FlashAttnFunc 1.74% 90.773us 35.18% 1.830ms 610.133us 0.000us 0.00% 4.639ms 1.546ms 3
4275
+ _flash_attn3_48fe103_dirty::fwd 0.99% 51.351us 33.44% 1.740ms 579.875us 3.468ms 100.00% 4.639ms 1.546ms 3
4276
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.469ms 100.05% 3.469ms 3.469ms 1
4277
  void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.468ms 100.00% 3.468ms 1.156ms 3
4278
+ Activity Buffer Request 27.26% 1.418ms 27.26% 1.418ms 1.418ms 1.172ms 33.79% 1.172ms 1.172ms 1
4279
+ aten::empty 0.51% 26.560us 0.51% 26.560us 4.427us 0.000us 0.00% 0.000us 0.000us 6
4280
+ cudaFuncSetAttribute 0.10% 5.101us 0.10% 5.101us 1.700us 0.000us 0.00% 0.000us 0.000us 3
4281
+ cudaLaunchKernel 4.58% 238.367us 4.58% 238.367us 79.456us 0.000us 0.00% 0.000us 0.000us 3
4282
+ cudaDeviceSynchronize 62.49% 3.251ms 62.49% 3.251ms 3.251ms 0.000us 0.00% 0.000us 0.000us 1
4283
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4284
+ Self CPU time total: 5.202ms
4285
  Self CUDA time total: 3.468ms
4286
 
4287
 
4288
  impl wl p50(ms) ok
4289
+ hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
4290
+ hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True
4291
+ hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
4292
+ hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.03 True
4293
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
4294
  hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4295
  </pre></div>
4296
  <div class="cell-stderr">
4297
  Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4298
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.42it/s]
4299
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.84it/s]
4300
  </div>
4301
  <div class="cell-artifacts">
4302
  <h4>Artifacts:</h4>
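Note: the p50(ms) column in each summary table is a median over repeated timed calls. A minimal sketch of that measurement with CUDA events; the warmup and repeat counts here are illustrative, not the harness's actual settings:

    import torch

    def p50_ms(fn, warmup=3, reps=10):
        for _ in range(warmup):  # discard cold-start and caching effects
            fn()
        torch.cuda.synchronize()
        times = []
        for _ in range(reps):
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            fn()
            end.record()
            torch.cuda.synchronize()
            times.append(start.elapsed_time(end))  # milliseconds
        return sorted(times)[len(times) // 2]  # median, i.e. p50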
flash_attn/impls/mem_efficient_attention.html CHANGED
@@ -4110,7 +4110,7 @@ Cell: benchmark | 3.94s
4110
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4112
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4113
- <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/mem_efficient_attention.md" target="_blank" class="github-btn">GitHub</a>
4114
  </div>
4115
  <div id="code-benchmark" class="cell-code" data-lines="31">
4116
  <div class="code-wrap">
@@ -4159,28 +4159,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
4159
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4160
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4161
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4162
- torch_mem_eff 5.20% 361.468us 33.36% 2.319ms 2.319ms 0.000us 0.00% 5.387ms 5.387ms 1
4163
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.370ms 100.63% 5.370ms 5.370ms 1
4164
- aten::scaled_dot_product_attention 0.48% 33.240us 2.68% 186.333us 62.111us 0.000us 0.00% 4.719ms 1.573ms 3
4165
- aten::_scaled_dot_product_efficient_attention 0.35% 24.389us 2.20% 153.093us 51.031us 0.000us 0.00% 4.719ms 1.573ms 3
4166
- aten::_efficient_attention_forward 0.53% 37.120us 1.50% 104.111us 34.704us 4.719ms 88.44% 4.719ms 1.573ms 3
4167
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.719ms 88.44% 4.719ms 1.573ms 3
4168
- aten::contiguous 0.18% 12.841us 24.53% 1.706ms 189.522us 0.000us 0.00% 667.809us 74.201us 9
4169
- aten::clone 0.46% 31.899us 24.35% 1.693ms 188.095us 0.000us 0.00% 667.809us 74.201us 9
4170
- aten::copy_ 1.13% 78.352us 22.86% 1.589ms 176.604us 617.121us 11.56% 667.809us 74.201us 9
4171
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 617.121us 11.56% 617.121us 68.569us 9
4172
- Activity Buffer Request 20.52% 1.427ms 20.52% 1.427ms 1.427ms 50.688us 0.95% 50.688us 50.688us 1
4173
- aten::transpose 0.98% 68.237us 1.30% 90.074us 3.753us 0.000us 0.00% 0.000us 0.000us 24
4174
- aten::as_strided 0.31% 21.837us 0.31% 21.837us 0.910us 0.000us 0.00% 0.000us 0.000us 24
4175
- aten::empty_like 0.25% 17.541us 1.03% 71.521us 7.947us 0.000us 0.00% 0.000us 0.000us 9
4176
- aten::empty 1.19% 82.429us 1.19% 82.429us 3.925us 0.000us 0.00% 0.000us 0.000us 21
4177
- cudaLaunchKernel 1.61% 111.770us 1.61% 111.770us 9.314us 0.000us 0.00% 0.000us 0.000us 12
4178
- cudaStreamIsCapturing 0.05% 3.512us 0.05% 3.512us 1.171us 0.000us 0.00% 0.000us 0.000us 3
4179
- cudaFuncSetAttribute 0.11% 7.660us 0.11% 7.660us 2.553us 0.000us 0.00% 0.000us 0.000us 3
4180
- cudaDeviceSynchronize 66.64% 4.633ms 66.64% 4.633ms 4.633ms 0.000us 0.00% 0.000us 0.000us 1
4181
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4182
- Self CPU time total: 6.952ms
4183
- Self CUDA time total: 5.336ms
4184
 
4185
 
4186
 
@@ -4190,28 +4190,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
4190
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4191
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4192
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4193
- torch_mem_eff 3.61% 259.378us 29.44% 2.116ms 2.116ms 0.000us 0.00% 5.734ms 5.734ms 1
4194
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.688ms 100.15% 5.688ms 5.688ms 1
4195
- aten::scaled_dot_product_attention 0.27% 19.560us 2.06% 147.832us 49.277us 0.000us 0.00% 5.042ms 1.681ms 3
4196
- aten::_scaled_dot_product_efficient_attention 0.27% 19.340us 1.78% 128.272us 42.757us 0.000us 0.00% 5.042ms 1.681ms 3
4197
- aten::_efficient_attention_forward 0.39% 28.380us 1.18% 84.990us 28.330us 5.042ms 88.79% 5.042ms 1.681ms 3
4198
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.042ms 88.79% 5.042ms 1.681ms 3
4199
- aten::contiguous 0.11% 8.118us 23.11% 1.661ms 184.525us 0.000us 0.00% 691.453us 76.828us 9
4200
- aten::clone 0.32% 22.761us 23.00% 1.653ms 183.623us 0.000us 0.00% 691.453us 76.828us 9
4201
- aten::copy_ 0.95% 68.519us 21.65% 1.556ms 172.887us 636.925us 11.21% 691.453us 76.828us 9
4202
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.925us 11.21% 636.925us 70.769us 9
4203
- Activity Buffer Request 19.69% 1.415ms 19.69% 1.415ms 1.415ms 54.528us 0.96% 54.528us 54.528us 1
4204
- aten::transpose 0.75% 54.034us 1.00% 71.792us 2.991us 0.000us 0.00% 0.000us 0.000us 24
4205
- aten::as_strided 0.25% 17.758us 0.25% 17.758us 0.740us 0.000us 0.00% 0.000us 0.000us 24
4206
- aten::empty_like 0.18% 12.992us 1.03% 73.863us 8.207us 0.000us 0.00% 0.000us 0.000us 9
4207
- aten::empty 1.22% 87.512us 1.22% 87.512us 4.167us 0.000us 0.00% 0.000us 0.000us 21
4208
- cudaLaunchKernel 1.35% 96.951us 1.35% 96.951us 8.079us 0.000us 0.00% 0.000us 0.000us 12
4209
- cudaStreamIsCapturing 0.03% 2.489us 0.03% 2.489us 0.830us 0.000us 0.00% 0.000us 0.000us 3
4210
- cudaFuncSetAttribute 0.04% 3.130us 0.04% 3.130us 1.043us 0.000us 0.00% 0.000us 0.000us 3
4211
- cudaDeviceSynchronize 70.56% 5.071ms 70.56% 5.071ms 5.071ms 0.000us 0.00% 0.000us 0.000us 1
4212
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4213
- Self CPU time total: 7.187ms
4214
- Self CUDA time total: 5.679ms
4215
 
4216
 
4217
 
@@ -4221,28 +4221,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
4221
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4222
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4223
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4224
- torch_mem_eff 3.31% 247.873us 28.16% 2.111ms 2.111ms 0.000us 0.00% 6.014ms 6.014ms 1
4225
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.964ms 100.13% 5.964ms 5.964ms 1
4226
- aten::scaled_dot_product_attention 0.26% 19.681us 1.94% 145.404us 48.468us 0.000us 0.00% 5.300ms 1.767ms 3
4227
- aten::_scaled_dot_product_efficient_attention 0.25% 18.780us 1.68% 125.723us 41.908us 0.000us 0.00% 5.300ms 1.767ms 3
4228
- aten::_efficient_attention_forward 0.40% 29.910us 1.12% 83.752us 27.917us 5.300ms 89.00% 5.300ms 1.767ms 3
4229
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.300ms 89.00% 5.300ms 1.767ms 3
4230
- aten::contiguous 0.10% 7.548us 22.32% 1.673ms 185.921us 0.000us 0.00% 713.444us 79.272us 9
4231
- aten::clone 0.29% 21.851us 22.22% 1.666ms 185.082us 0.000us 0.00% 713.444us 79.272us 9
4232
- aten::copy_ 0.89% 66.441us 21.22% 1.591ms 176.813us 655.331us 11.00% 713.444us 79.272us 9
4233
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 655.331us 11.00% 655.331us 72.815us 9
4234
- Activity Buffer Request 19.37% 1.452ms 19.37% 1.452ms 1.452ms 58.113us 0.98% 58.113us 58.113us 1
4235
- aten::transpose 0.68% 50.773us 0.90% 67.843us 2.827us 0.000us 0.00% 0.000us 0.000us 24
4236
- aten::as_strided 0.23% 17.070us 0.23% 17.070us 0.711us 0.000us 0.00% 0.000us 0.000us 24
4237
- aten::empty_like 0.16% 12.290us 0.70% 52.570us 5.841us 0.000us 0.00% 0.000us 0.000us 9
4238
- aten::empty 0.87% 64.980us 0.87% 64.980us 3.094us 0.000us 0.00% 0.000us 0.000us 21
4239
- cudaLaunchKernel 1.28% 96.085us 1.28% 96.085us 8.007us 0.000us 0.00% 0.000us 0.000us 12
4240
- cudaStreamIsCapturing 0.03% 2.520us 0.03% 2.520us 0.840us 0.000us 0.00% 0.000us 0.000us 3
4241
- cudaFuncSetAttribute 0.04% 3.050us 0.04% 3.050us 1.017us 0.000us 0.00% 0.000us 0.000us 3
4242
- cudaDeviceSynchronize 71.84% 5.386ms 71.84% 5.386ms 5.386ms 0.000us 0.00% 0.000us 0.000us 1
4243
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4244
- Self CPU time total: 7.498ms
4245
- Self CUDA time total: 5.956ms
4246
 
4247
 
4248
 
@@ -4252,28 +4252,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
4252
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4253
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4254
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4255
- torch_mem_eff 3.20% 247.803us 30.17% 2.338ms 2.338ms 0.000us 0.00% 6.050ms 6.050ms 1
4256
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.000ms 100.13% 6.000ms 6.000ms 1
4257
- aten::scaled_dot_product_attention 0.37% 28.670us 2.04% 158.093us 52.698us 0.000us 0.00% 5.339ms 1.780ms 3
4258
- aten::_scaled_dot_product_efficient_attention 0.26% 20.220us 1.67% 129.423us 43.141us 0.000us 0.00% 5.339ms 1.780ms 3
4259
- aten::_efficient_attention_forward 0.38% 29.560us 1.08% 83.863us 27.954us 5.339ms 89.10% 5.339ms 1.780ms 3
4260
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.339ms 89.10% 5.339ms 1.780ms 3
4261
- aten::contiguous 0.10% 7.610us 24.36% 1.887ms 209.722us 0.000us 0.00% 711.328us 79.036us 9
4262
- aten::clone 0.28% 21.914us 24.26% 1.880ms 208.876us 0.000us 0.00% 711.328us 79.036us 9
4263
- aten::copy_ 0.87% 67.261us 23.30% 1.806ms 200.640us 653.248us 10.90% 711.328us 79.036us 9
4264
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 653.248us 10.90% 653.248us 72.583us 9
4265
- Activity Buffer Request 18.39% 1.425ms 18.39% 1.425ms 1.425ms 58.080us 0.97% 58.080us 58.080us 1
4266
- aten::transpose 0.68% 52.310us 0.90% 69.650us 2.902us 0.000us 0.00% 0.000us 0.000us 24
4267
- aten::as_strided 0.22% 17.340us 0.22% 17.340us 0.723us 0.000us 0.00% 0.000us 0.000us 24
4268
- aten::empty_like 0.16% 12.088us 0.67% 52.209us 5.801us 0.000us 0.00% 0.000us 0.000us 9
4269
- aten::empty 0.84% 64.993us 0.84% 64.993us 3.095us 0.000us 0.00% 0.000us 0.000us 21
4270
- cudaLaunchKernel 4.36% 337.546us 4.36% 337.546us 28.129us 0.000us 0.00% 0.000us 0.000us 12
4271
- cudaStreamIsCapturing 0.03% 2.491us 0.03% 2.491us 0.830us 0.000us 0.00% 0.000us 0.000us 3
4272
- cudaFuncSetAttribute 0.04% 3.020us 0.04% 3.020us 1.007us 0.000us 0.00% 0.000us 0.000us 3
4273
- cudaDeviceSynchronize 69.83% 5.411ms 69.83% 5.411ms 5.411ms 0.000us 0.00% 0.000us 0.000us 1
4274
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4275
- Self CPU time total: 7.749ms
4276
- Self CUDA time total: 5.992ms
4277
 
4278
 
4279
 
@@ -4283,28 +4283,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
4283
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4284
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4285
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4286
- torch_mem_eff 3.22% 253.272us 29.03% 2.283ms 2.283ms 0.000us 0.00% 6.248ms 6.248ms 1
4287
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.196ms 100.13% 6.196ms 6.196ms 1
4288
- aten::scaled_dot_product_attention 0.25% 19.441us 2.25% 176.884us 58.961us 0.000us 0.00% 5.524ms 1.841ms 3
4289
- aten::_scaled_dot_product_efficient_attention 0.26% 20.811us 2.00% 157.443us 52.481us 0.000us 0.00% 5.524ms 1.841ms 3
4290
- aten::_efficient_attention_forward 0.41% 31.883us 1.42% 111.902us 37.301us 5.524ms 89.27% 5.524ms 1.841ms 3
4291
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.524ms 89.27% 5.524ms 1.841ms 3
4292
- aten::contiguous 0.10% 7.580us 22.97% 1.807ms 200.732us 0.000us 0.00% 724.035us 80.448us 9
4293
- aten::clone 0.28% 22.150us 22.88% 1.799ms 199.890us 0.000us 0.00% 724.035us 80.448us 9
4294
- aten::copy_ 0.85% 67.019us 21.94% 1.725ms 191.709us 664.226us 10.73% 724.035us 80.448us 9
4295
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 664.226us 10.73% 664.226us 73.803us 9
4296
- Activity Buffer Request 18.12% 1.425ms 18.12% 1.425ms 1.425ms 59.809us 0.97% 59.809us 59.809us 1
4297
- aten::transpose 0.68% 53.201us 0.91% 71.182us 2.966us 0.000us 0.00% 0.000us 0.000us 24
4298
- aten::as_strided 0.23% 17.981us 0.23% 17.981us 0.749us 0.000us 0.00% 0.000us 0.000us 24
4299
- aten::empty_like 0.15% 12.001us 0.65% 51.482us 5.720us 0.000us 0.00% 0.000us 0.000us 9
4300
- aten::empty 0.81% 63.729us 0.81% 63.729us 3.035us 0.000us 0.00% 0.000us 0.000us 21
4301
- cudaLaunchKernel 3.60% 283.426us 3.60% 283.426us 23.619us 0.000us 0.00% 0.000us 0.000us 12
4302
- cudaStreamIsCapturing 0.03% 2.490us 0.03% 2.490us 0.830us 0.000us 0.00% 0.000us 0.000us 3
4303
- cudaFuncSetAttribute 0.04% 2.980us 0.04% 2.980us 0.993us 0.000us 0.00% 0.000us 0.000us 3
4304
- cudaDeviceSynchronize 70.97% 5.581ms 70.97% 5.581ms 5.581ms 0.000us 0.00% 0.000us 0.000us 1
4305
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4306
- Self CPU time total: 7.864ms
4307
- Self CUDA time total: 6.188ms
4308
 
4309
 
4310
 
@@ -4314,37 +4314,37 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
4314
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4315
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4316
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4317
- torch_mem_eff 3.10% 256.636us 27.41% 2.272ms 2.272ms 0.000us 0.00% 6.685ms 6.685ms 1
4318
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.632ms 100.12% 6.632ms 6.632ms 1
4319
- aten::scaled_dot_product_attention 0.23% 18.791us 1.80% 149.483us 49.828us 0.000us 0.00% 5.954ms 1.985ms 3
4320
- aten::_scaled_dot_product_efficient_attention 0.24% 19.642us 1.58% 130.692us 43.564us 0.000us 0.00% 5.954ms 1.985ms 3
4321
- aten::_efficient_attention_forward 0.40% 33.027us 1.05% 86.901us 28.967us 5.954ms 89.88% 5.954ms 1.985ms 3
4322
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.954ms 89.88% 5.954ms 1.985ms 3
4323
- aten::contiguous 0.09% 7.531us 21.68% 1.797ms 199.660us 0.000us 0.00% 731.136us 81.237us 9
4324
- aten::clone 0.27% 22.649us 21.59% 1.789ms 198.823us 0.000us 0.00% 731.136us 81.237us 9
4325
- aten::copy_ 0.82% 67.700us 20.66% 1.712ms 190.261us 670.176us 10.12% 731.136us 81.237us 9
4326
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 670.176us 10.12% 670.176us 74.464us 9
4327
- Activity Buffer Request 17.30% 1.434ms 17.30% 1.434ms 1.434ms 60.960us 0.92% 60.960us 60.960us 1
4328
- aten::transpose 0.90% 75.001us 1.12% 92.890us 3.870us 0.000us 0.00% 0.000us 0.000us 24
4329
- aten::as_strided 0.22% 17.889us 0.22% 17.889us 0.745us 0.000us 0.00% 0.000us 0.000us 24
4330
- aten::empty_like 0.15% 12.259us 0.66% 54.410us 6.046us 0.000us 0.00% 0.000us 0.000us 9
4331
- aten::empty 0.81% 67.133us 0.81% 67.133us 3.197us 0.000us 0.00% 0.000us 0.000us 21
4332
- cudaLaunchKernel 2.82% 234.057us 2.82% 234.057us 19.505us 0.000us 0.00% 0.000us 0.000us 12
4333
- cudaStreamIsCapturing 0.03% 2.420us 0.03% 2.420us 0.807us 0.000us 0.00% 0.000us 0.000us 3
4334
- cudaFuncSetAttribute 0.04% 3.430us 0.04% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3
4335
- cudaDeviceSynchronize 72.59% 6.017ms 72.59% 6.017ms 6.017ms 0.000us 0.00% 0.000us 0.000us 1
4336
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4337
- Self CPU time total: 8.289ms
4338
- Self CUDA time total: 6.624ms
4339
 
4340
 
4341
  impl wl p50(ms) ok
4342
- torch_mem_eff cuda_attn_L128_bfloat16 1.81 True
4343
- torch_mem_eff cuda_attn_L256_bfloat16 1.88 True
4344
- torch_mem_eff cuda_attn_L320_bfloat16 1.97 True
4345
- torch_mem_eff cuda_attn_L384_bfloat16 1.97 True
4346
- torch_mem_eff cuda_attn_L448_bfloat16 2.09 True
4347
- torch_mem_eff cuda_attn_L512_bfloat16 2.22 True
4348
  </pre></div>
4349
  <div class="cell-artifacts">
4350
  <h4>Artifacts:</h4>
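Note: the torch_mem_eff traces route aten::scaled_dot_product_attention to aten::_efficient_attention_forward (the fmha_cutlassF_bf16_* kernel), with aten::contiguous / aten::copy_ layout copies accounting for roughly a tenth of the CUDA time. A minimal sketch that pins SDPA to that backend on a recent PyTorch 2.x:

    import torch
    import torch.nn.functional as F
    from torch.nn.attention import SDPBackend, sdpa_kernel

    q = torch.randn(1, 16, 512, 64, device="cuda", dtype=torch.bfloat16)
    k, v = torch.randn_like(q), torch.randn_like(q)

    # Restrict dispatch to the memory-efficient backend so SDPA hits
    # aten::_efficient_attention_forward, as in the traces above.
    with sdpa_kernel([SDPBackend.EFFICIENT_ATTENTION]):
        out = F.scaled_dot_product_attention(q, k, v)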
 
4110
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4112
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4113
+ <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/mem_efficient_attention.md" target="_blank" class="github-btn">GitHub</a>
4114
  </div>
4115
  <div id="code-benchmark" class="cell-code" data-lines="31">
4116
  <div class="code-wrap">
 
4159
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4160
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4161
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4162
+ torch_mem_eff 5.14% 365.276us 32.53% 2.313ms 2.313ms 0.000us 0.00% 5.511ms 5.511ms 1
4163
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.492ms 100.58% 5.492ms 5.492ms 1
4164
+ aten::scaled_dot_product_attention 0.43% 30.401us 2.47% 175.534us 58.511us 0.000us 0.00% 4.841ms 1.614ms 3
4165
+ aten::_scaled_dot_product_efficient_attention 0.33% 23.489us 2.04% 145.133us 48.378us 0.000us 0.00% 4.841ms 1.614ms 3
4166
+ aten::_efficient_attention_forward 0.51% 36.572us 1.40% 99.733us 33.244us 4.841ms 88.65% 4.841ms 1.614ms 3
4167
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.841ms 88.65% 4.841ms 1.614ms 3
4168
+ aten::contiguous 0.18% 12.851us 23.99% 1.706ms 189.523us 0.000us 0.00% 670.241us 74.471us 9
4169
+ aten::clone 0.46% 32.742us 23.80% 1.693ms 188.095us 0.000us 0.00% 670.241us 74.471us 9
4170
+ aten::copy_ 1.05% 74.801us 22.33% 1.588ms 176.415us 619.776us 11.35% 670.241us 74.471us 9
4171
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 619.776us 11.35% 619.776us 68.864us 9
4172
+ Activity Buffer Request 20.17% 1.434ms 20.17% 1.434ms 1.434ms 50.465us 0.92% 50.465us 50.465us 1
4173
+ aten::transpose 0.93% 66.224us 1.25% 88.644us 3.693us 0.000us 0.00% 0.000us 0.000us 24
4174
+ aten::as_strided 0.32% 22.420us 0.32% 22.420us 0.934us 0.000us 0.00% 0.000us 0.000us 24
4175
+ aten::empty_like 0.25% 17.919us 1.02% 72.382us 8.042us 0.000us 0.00% 0.000us 0.000us 9
4176
+ aten::empty 1.14% 81.114us 1.14% 81.114us 3.863us 0.000us 0.00% 0.000us 0.000us 21
4177
+ cudaLaunchKernel 1.46% 103.973us 1.46% 103.973us 8.664us 0.000us 0.00% 0.000us 0.000us 12
4178
+ cudaStreamIsCapturing 0.04% 2.960us 0.04% 2.960us 0.987us 0.000us 0.00% 0.000us 0.000us 3
4179
+ cudaFuncSetAttribute 0.12% 8.310us 0.12% 8.310us 2.770us 0.000us 0.00% 0.000us 0.000us 3
4180
+ cudaDeviceSynchronize 67.47% 4.798ms 67.47% 4.798ms 4.798ms 0.000us 0.00% 0.000us 0.000us 1
4181
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4182
+ Self CPU time total: 7.111ms
4183
+ Self CUDA time total: 5.460ms
4184
 
4185
 
4186
 
 
4190
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4191
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4192
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4193
+ torch_mem_eff 3.28% 242.746us 28.00% 2.075ms 2.075ms 0.000us 0.00% 5.933ms 5.933ms 1
4194
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.886ms 100.14% 5.886ms 5.886ms 1
4195
+ aten::scaled_dot_product_attention 0.25% 18.240us 1.89% 140.073us 46.691us 0.000us 0.00% 5.241ms 1.747ms 3
4196
+ aten::_scaled_dot_product_efficient_attention 0.25% 18.689us 1.64% 121.833us 40.611us 0.000us 0.00% 5.241ms 1.747ms 3
4197
+ aten::_efficient_attention_forward 0.38% 28.462us 1.09% 81.063us 27.021us 5.241ms 89.17% 5.241ms 1.747ms 3
4198
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.241ms 89.17% 5.241ms 1.747ms 3
4199
+ aten::contiguous 0.10% 7.041us 22.26% 1.650ms 183.285us 0.000us 0.00% 691.103us 76.789us 9
4200
+ aten::clone 0.29% 21.342us 22.17% 1.643ms 182.503us 0.000us 0.00% 691.103us 76.789us 9
4201
+ aten::copy_ 0.86% 63.451us 21.24% 1.574ms 174.872us 636.671us 10.83% 691.103us 76.789us 9
4202
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.671us 10.83% 636.671us 70.741us 9
4203
+ Activity Buffer Request 19.50% 1.445ms 19.50% 1.445ms 1.445ms 54.432us 0.93% 54.432us 54.432us 1
4204
+ aten::transpose 0.64% 47.650us 0.87% 64.701us 2.696us 0.000us 0.00% 0.000us 0.000us 24
4205
+ aten::as_strided 0.23% 17.051us 0.23% 17.051us 0.710us 0.000us 0.00% 0.000us 0.000us 24
4206
+ aten::empty_like 0.16% 11.589us 0.64% 47.330us 5.259us 0.000us 0.00% 0.000us 0.000us 9
4207
+ aten::empty 0.82% 60.521us 0.82% 60.521us 2.882us 0.000us 0.00% 0.000us 0.000us 21
4208
+ cudaLaunchKernel 1.19% 88.044us 1.19% 88.044us 7.337us 0.000us 0.00% 0.000us 0.000us 12
4209
+ cudaStreamIsCapturing 0.03% 2.420us 0.03% 2.420us 0.807us 0.000us 0.00% 0.000us 0.000us 3
4210
+ cudaFuncSetAttribute 0.04% 3.030us 0.04% 3.030us 1.010us 0.000us 0.00% 0.000us 0.000us 3
4211
+ cudaDeviceSynchronize 72.00% 5.335ms 72.00% 5.335ms 5.335ms 0.000us 0.00% 0.000us 0.000us 1
4212
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4213
+ Self CPU time total: 7.410ms
4214
+ Self CUDA time total: 5.878ms
4215
 
4216
 
4217
 
 
4221
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4222
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4223
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4224
+ torch_mem_eff 3.21% 244.055us 27.47% 2.092ms 2.092ms 0.000us 0.00% 6.130ms 6.130ms 1
4225
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.080ms 100.14% 6.080ms 6.080ms 1
4226
+ aten::scaled_dot_product_attention 0.23% 17.641us 1.86% 141.944us 47.315us 0.000us 0.00% 5.414ms 1.805ms 3
4227
+ aten::_scaled_dot_product_efficient_attention 0.25% 19.359us 1.63% 124.303us 41.434us 0.000us 0.00% 5.414ms 1.805ms 3
4228
+ aten::_efficient_attention_forward 0.37% 28.219us 1.06% 80.592us 26.864us 5.414ms 89.17% 5.414ms 1.805ms 3
4229
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.414ms 89.17% 5.414ms 1.805ms 3
4230
+ aten::contiguous 0.11% 8.060us 21.81% 1.661ms 184.510us 0.000us 0.00% 716.192us 79.577us 9
4231
+ aten::clone 0.29% 22.431us 21.70% 1.653ms 183.615us 0.000us 0.00% 716.192us 79.577us 9
4232
+ aten::copy_ 0.81% 61.641us 20.75% 1.580ms 175.564us 657.728us 10.83% 716.192us 79.577us 9
4233
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 657.728us 10.83% 657.728us 73.081us 9
4234
+ Activity Buffer Request 19.08% 1.453ms 19.08% 1.453ms 1.453ms 58.464us 0.96% 58.464us 58.464us 1
4235
+ aten::transpose 0.69% 52.203us 0.92% 69.763us 2.907us 0.000us 0.00% 0.000us 0.000us 24
4236
+ aten::as_strided 0.23% 17.560us 0.23% 17.560us 0.732us 0.000us 0.00% 0.000us 0.000us 24
4237
+ aten::empty_like 0.15% 11.581us 0.66% 50.023us 5.558us 0.000us 0.00% 0.000us 0.000us 9
4238
+ aten::empty 0.84% 63.785us 0.84% 63.785us 3.037us 0.000us 0.00% 0.000us 0.000us 21
4239
+ cudaLaunchKernel 1.14% 86.832us 1.14% 86.832us 7.236us 0.000us 0.00% 0.000us 0.000us 12
4240
+ cudaStreamIsCapturing 0.03% 2.250us 0.03% 2.250us 0.750us 0.000us 0.00% 0.000us 0.000us 3
4241
+ cudaFuncSetAttribute 0.04% 3.260us 0.04% 3.260us 1.087us 0.000us 0.00% 0.000us 0.000us 3
4242
+ cudaDeviceSynchronize 72.53% 5.522ms 72.53% 5.522ms 5.522ms 0.000us 0.00% 0.000us 0.000us 1
4243
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4244
+ Self CPU time total: 7.614ms
4245
+ Self CUDA time total: 6.072ms
4246
 
4247
 
4248
 
 
4252
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4253
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4254
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4255
+ torch_mem_eff 3.16% 248.365us 29.29% 2.300ms 2.300ms 0.000us 0.00% 6.163ms 6.163ms 1
4256
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.114ms 100.14% 6.114ms 6.114ms 1
4257
+ aten::scaled_dot_product_attention 0.24% 19.232us 1.82% 142.774us 47.591us 0.000us 0.00% 5.452ms 1.817ms 3
4258
+ aten::_scaled_dot_product_efficient_attention 0.25% 19.461us 1.57% 123.542us 41.181us 0.000us 0.00% 5.452ms 1.817ms 3
4259
+ aten::_efficient_attention_forward 0.37% 29.029us 1.03% 80.672us 26.891us 5.452ms 89.29% 5.452ms 1.817ms 3
4260
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.452ms 89.29% 5.452ms 1.817ms 3
4261
+ aten::contiguous 0.10% 7.931us 23.78% 1.867ms 207.435us 0.000us 0.00% 711.072us 79.008us 9
4262
+ aten::clone 0.30% 23.532us 23.68% 1.859ms 206.554us 0.000us 0.00% 711.072us 79.008us 9
4263
+ aten::copy_ 0.81% 63.779us 22.73% 1.785ms 198.306us 653.792us 10.71% 711.072us 79.008us 9
4264
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 653.792us 10.71% 653.792us 72.644us 9
4265
+ Activity Buffer Request 18.59% 1.459ms 18.59% 1.459ms 1.459ms 57.280us 0.94% 57.280us 57.280us 1
4266
+ aten::transpose 0.62% 48.610us 0.83% 65.130us 2.714us 0.000us 0.00% 0.000us 0.000us 24
4267
+ aten::as_strided 0.21% 16.520us 0.21% 16.520us 0.688us 0.000us 0.00% 0.000us 0.000us 24
4268
+ aten::empty_like 0.16% 12.281us 0.65% 50.702us 5.634us 0.000us 0.00% 0.000us 0.000us 9
4269
+ aten::empty 0.80% 62.502us 0.80% 62.502us 2.976us 0.000us 0.00% 0.000us 0.000us 21
4270
+ cudaLaunchKernel 3.60% 282.729us 3.60% 282.729us 23.561us 0.000us 0.00% 0.000us 0.000us 12
4271
+ cudaStreamIsCapturing 0.03% 2.471us 0.03% 2.471us 0.824us 0.000us 0.00% 0.000us 0.000us 3
4272
+ cudaFuncSetAttribute 0.05% 4.120us 0.05% 4.120us 1.373us 0.000us 0.00% 0.000us 0.000us 3
4273
+ cudaDeviceSynchronize 70.71% 5.551ms 70.71% 5.551ms 5.551ms 0.000us 0.00% 0.000us 0.000us 1
4274
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4275
+ Self CPU time total: 7.851ms
4276
+ Self CUDA time total: 6.106ms
4277
 
4278
 
4279
 
 
4283
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4284
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4285
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4286
+ torch_mem_eff 3.01% 243.675us 28.03% 2.272ms 2.272ms 0.000us 0.00% 6.451ms 6.451ms 1
4287
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.399ms 100.13% 6.399ms 6.399ms 1
4288
+ aten::scaled_dot_product_attention 0.23% 18.671us 1.77% 143.224us 47.741us 0.000us 0.00% 5.726ms 1.909ms 3
4289
+ aten::_scaled_dot_product_efficient_attention 0.24% 19.652us 1.54% 124.553us 41.518us 0.000us 0.00% 5.726ms 1.909ms 3
4290
+ aten::_efficient_attention_forward 0.35% 28.317us 0.99% 80.642us 26.881us 5.726ms 89.60% 5.726ms 1.909ms 3
4291
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.726ms 89.60% 5.726ms 1.909ms 3
4292
+ aten::contiguous 0.10% 7.791us 22.70% 1.840ms 204.460us 0.000us 0.00% 725.025us 80.558us 9
4293
+ aten::clone 0.29% 23.489us 22.61% 1.832ms 203.594us 0.000us 0.00% 725.025us 80.558us 9
4294
+ aten::copy_ 0.81% 65.293us 21.68% 1.757ms 195.223us 664.641us 10.40% 725.025us 80.558us 9
4295
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 664.641us 10.40% 664.641us 73.849us 9
4296
+ Activity Buffer Request 17.77% 1.440ms 17.77% 1.440ms 1.440ms 60.384us 0.94% 60.384us 60.384us 1
4297
+ aten::transpose 0.63% 51.151us 0.85% 69.251us 2.885us 0.000us 0.00% 0.000us 0.000us 24
4298
+ aten::as_strided 0.22% 18.100us 0.22% 18.100us 0.754us 0.000us 0.00% 0.000us 0.000us 24
4299
+ aten::empty_like 0.15% 11.960us 0.64% 51.852us 5.761us 0.000us 0.00% 0.000us 0.000us 9
4300
+ aten::empty 0.79% 64.314us 0.79% 64.314us 3.063us 0.000us 0.00% 0.000us 0.000us 21
4301
+ cudaLaunchKernel 3.36% 272.117us 3.36% 272.117us 22.676us 0.000us 0.00% 0.000us 0.000us 12
4302
+ cudaStreamIsCapturing 0.03% 2.500us 0.03% 2.500us 0.833us 0.000us 0.00% 0.000us 0.000us 3
4303
+ cudaFuncSetAttribute 0.06% 4.532us 0.06% 4.532us 1.511us 0.000us 0.00% 0.000us 0.000us 3
4304
+ cudaDeviceSynchronize 71.97% 5.833ms 71.97% 5.833ms 5.833ms 0.000us 0.00% 0.000us 0.000us 1
4305
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4306
+ Self CPU time total: 8.105ms
4307
+ Self CUDA time total: 6.391ms
4308
 
4309
 
4310
 
 
4314
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4315
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4316
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4317
+ torch_mem_eff 2.88% 242.135us 27.00% 2.269ms 2.269ms 0.000us 0.00% 6.759ms 6.759ms 1
4318
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.705ms 100.12% 6.705ms 6.705ms 1
4319
+ aten::scaled_dot_product_attention 0.21% 17.851us 1.72% 144.884us 48.295us 0.000us 0.00% 6.024ms 2.008ms 3
4320
+ aten::_scaled_dot_product_efficient_attention 0.23% 19.591us 1.51% 127.033us 42.344us 0.000us 0.00% 6.024ms 2.008ms 3
4321
+ aten::_efficient_attention_forward 0.34% 28.520us 0.97% 81.532us 27.177us 6.024ms 89.96% 6.024ms 2.008ms 3
4322
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 6.024ms 89.96% 6.024ms 2.008ms 3
4323
+ aten::contiguous 0.10% 8.099us 21.87% 1.838ms 204.242us 0.000us 0.00% 734.178us 81.575us 9
4324
+ aten::clone 0.28% 23.122us 21.78% 1.830ms 203.342us 0.000us 0.00% 734.178us 81.575us 9
4325
+ aten::copy_ 0.74% 62.180us 20.86% 1.753ms 194.799us 672.322us 10.04% 734.178us 81.575us 9
4326
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 672.322us 10.04% 672.322us 74.702us 9
4327
+ Activity Buffer Request 17.19% 1.445ms 17.19% 1.445ms 1.445ms 61.856us 0.92% 61.856us 61.856us 1
4328
+ aten::transpose 0.62% 52.351us 0.83% 70.022us 2.918us 0.000us 0.00% 0.000us 0.000us 24
4329
+ aten::as_strided 0.21% 17.671us 0.21% 17.671us 0.736us 0.000us 0.00% 0.000us 0.000us 24
4330
+ aten::empty_like 0.15% 12.653us 0.64% 53.763us 5.974us 0.000us 0.00% 0.000us 0.000us 9
4331
+ aten::empty 0.79% 66.761us 0.79% 66.761us 3.179us 0.000us 0.00% 0.000us 0.000us 21
4332
+ cudaLaunchKernel 3.19% 267.907us 3.19% 267.907us 22.326us 0.000us 0.00% 0.000us 0.000us 12
4333
+ cudaStreamIsCapturing 0.03% 2.430us 0.03% 2.430us 0.810us 0.000us 0.00% 0.000us 0.000us 3
4334
+ cudaFuncSetAttribute 0.04% 3.350us 0.04% 3.350us 1.117us 0.000us 0.00% 0.000us 0.000us 3
4335
+ cudaDeviceSynchronize 73.00% 6.134ms 73.00% 6.134ms 6.134ms 0.000us 0.00% 0.000us 0.000us 1
4336
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4337
+ Self CPU time total: 8.404ms
4338
+ Self CUDA time total: 6.697ms
4339
 
4340
 
4341
  impl wl p50(ms) ok
4342
+ torch_mem_eff cuda_attn_L128_bfloat16 1.85 True
4343
+ torch_mem_eff cuda_attn_L256_bfloat16 1.95 True
4344
+ torch_mem_eff cuda_attn_L320_bfloat16 1.99 True
4345
+ torch_mem_eff cuda_attn_L384_bfloat16 2.07 True
4346
+ torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
4347
+ torch_mem_eff cuda_attn_L512_bfloat16 2.25 True
4348
  </pre></div>
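The p50(ms) column in the table above is the median of repeated timed runs. As a rough illustration (not the benchmark's actual cell code), the percentile fields recorded in the JSONL artifacts can be derived from the raw per-rep timings like this:

import statistics

def summarize(raw_times_ms):
    # Deciles via the inclusive method; q[0] is ~p10, q[-1] is ~p90.
    xs = sorted(raw_times_ms)
    q = statistics.quantiles(xs, n=10, method="inclusive")
    return {
        "p10": q[0],
        "p50": statistics.median(xs),
        "p90": q[-1],
        "mean": statistics.fmean(xs),
    }

print(summarize([1.85, 1.83, 1.88, 1.84, 1.90]))  # illustrative timings only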
4349
  <div class="cell-artifacts">
4350
  <h4>Artifacts:</h4>
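The torch_mem_eff traces above all center on aten::_efficient_attention_forward dispatching to the fmha_cutlassF_bf16 kernel, with the aten::contiguous/clone copies accounting for the remaining ~10% of CUDA time. A minimal sketch of how such a trace could be reproduced, assuming illustrative shapes rather than the benchmark's actual inputs:

import torch
from torch.nn.attention import SDPBackend, sdpa_kernel

def torch_mem_eff(q, k, v):
    # Pin SDPA to the memory-efficient (cutlass) backend, which appears in the
    # traces as aten::_efficient_attention_forward / fmha_cutlassF_bf16_*.
    with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
        return torch.nn.functional.scaled_dot_product_attention(q, k, v)

if torch.cuda.is_available():
    # (batch, heads, seq, head_dim) — sizes here are assumptions for the sketch.
    q = k = v = torch.randn(1, 16, 512, 64, device="cuda", dtype=torch.bfloat16)
    with torch.profiler.profile(activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ]) as prof:
        with torch.profiler.record_function("torch_mem_eff"):
            torch_mem_eff(q, k, v)
        torch.cuda.synchronize()
    # Prints a table in the same format as the traces above.
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))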
flash_attn/impls/sage_attention.html CHANGED
@@ -4104,13 +4104,14 @@ body[data-tool="eraser"] .main-content {
4104
  <span class="collapse-indicators">
4105
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
4106
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4107
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
4108
  </span> |
4109
- Cell: benchmark | 4.12s
4110
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4112
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4113
- <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/sage_attention.md" target="_blank" class="github-btn">GitHub</a>
 
4114
  </div>
4115
  <div id="code-benchmark" class="cell-code" data-lines="32">
4116
  <div class="code-wrap">
@@ -4155,24 +4156,27 @@ Cell: benchmark | 4.12s
4155
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
4156
  impl wl p50(ms) ok
4157
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4158
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
4159
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4160
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
4161
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4162
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
4163
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4164
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
4165
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4166
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
4167
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4168
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
4169
  </pre></div>
4170
- <div class="cell-stderr">
4171
- Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
4172
- Fetching 11 files: 18%|█▊ | 2/11 [00:00<00:00, 17.35it/s]
4173
- Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 15.18it/s]
4174
- Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 21.06it/s]
4175
  </div>
4176
  <div class="cell-artifacts">
4177
  <h4>Artifacts:</h4>
4178
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
4104
  <span class="collapse-indicators">
4105
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
4106
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4107
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
4108
  </span> |
4109
+ Cell: benchmark | 4.69s
4110
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4112
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4113
+ <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/sage_attention.md" target="_blank" class="github-btn">GitHub</a>
4114
+ <a href="https://huggingface.co/kernels-community/sage_attention" target="_blank" class="hf-btn">🤗 HF</a>
4115
  </div>
4116
  <div id="code-benchmark" class="cell-code" data-lines="32">
4117
  <div class="code-wrap">
 
4156
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
4157
  impl wl p50(ms) ok
4158
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4159
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
4160
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4161
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
4162
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4163
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
4164
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4165
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
4166
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4167
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
4168
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4169
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
4170
  </pre></div>
4171
+ <div class="uv-install-logs" id="uv-logs-benchmark">
4172
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4173
+ <div class="uv-logs-content" style="display: none;">
4174
+ Installed 15 packages in 14ms
 
4175
  </div>
4176
+ </div>
4177
+ <div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
4178
+ Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 11.73it/s]
4179
+ Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 16.12it/s]</div>
4180
  <div class="cell-artifacts">
4181
  <h4>Artifacts:</h4>
4182
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/xformers.html CHANGED
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
4106
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4107
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
4108
  </span> |
4109
- Cell: benchmark | 5.04s
4110
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4112
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4113
- <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/xformers.md" target="_blank" class="github-btn">GitHub</a>
4114
  </div>
4115
  <div id="code-benchmark" class="cell-code" data-lines="30">
4116
  <div class="code-wrap">
@@ -4158,21 +4158,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
4158
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4159
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4160
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4161
- xformers_meff 11.46% 506.438us 53.66% 2.372ms 2.372ms 0.000us 0.00% 3.500ms 3.500ms 1
4162
- xformers_flash3::flash_fwd 4.48% 198.083us 41.44% 1.831ms 610.487us 0.000us 0.00% 3.500ms 1.167ms 3
4163
- flash_attn_3::fwd 1.73% 76.649us 36.96% 1.633ms 544.459us 2.610ms 100.00% 3.500ms 1.167ms 3
4164
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.612ms 100.06% 2.612ms 2.612ms 1
4165
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.610ms 100.00% 2.610ms 870.154us 3
4166
- Activity Buffer Request 33.26% 1.470ms 33.26% 1.470ms 1.470ms 889.248us 34.06% 889.248us 889.248us 1
4167
- aten::empty 0.80% 35.182us 0.80% 35.182us 5.864us 0.000us 0.00% 0.000us 0.000us 6
4168
- cudaFuncSetAttribute 0.25% 10.920us 0.25% 10.920us 3.640us 0.000us 0.00% 0.000us 0.000us 3
4169
- cudaLaunchKernel 0.92% 40.501us 0.92% 40.501us 13.500us 0.000us 0.00% 0.000us 0.000us 3
4170
- aten::reshape 0.27% 12.132us 0.77% 33.872us 5.645us 0.000us 0.00% 0.000us 0.000us 6
4171
- aten::view 0.49% 21.740us 0.49% 21.740us 3.623us 0.000us 0.00% 0.000us 0.000us 6
4172
- cudaDeviceSynchronize 46.34% 2.048ms 46.34% 2.048ms 2.048ms 0.000us 0.00% 0.000us 0.000us 1
4173
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4174
- Self CPU time total: 4.420ms
4175
- Self CUDA time total: 2.610ms
4176
 
4177
 
4178
 
@@ -4182,21 +4182,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
4182
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4183
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4184
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4185
- xformers_meff 7.25% 318.297us 46.47% 2.042ms 2.042ms 0.000us 0.00% 3.722ms 3.722ms 1
4186
- xformers_flash3::flash_fwd 3.37% 148.131us 38.68% 1.699ms 566.453us 0.000us 0.00% 3.722ms 1.241ms 3
4187
- flash_attn_3::fwd 1.17% 51.450us 35.31% 1.551ms 517.076us 2.780ms 100.00% 3.722ms 1.241ms 3
4188
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.782ms 100.05% 2.782ms 2.782ms 1
4189
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.780ms 100.00% 2.780ms 926.692us 3
4190
- Activity Buffer Request 32.58% 1.431ms 32.58% 1.431ms 1.431ms 942.244us 33.89% 942.244us 942.244us 1
4191
- aten::empty 0.66% 29.210us 0.66% 29.210us 4.868us 0.000us 0.00% 0.000us 0.000us 6
4192
- cudaFuncSetAttribute 0.13% 5.512us 0.13% 5.512us 1.837us 0.000us 0.00% 0.000us 0.000us 3
4193
- cudaLaunchKernel 0.77% 34.031us 0.77% 34.031us 11.344us 0.000us 0.00% 0.000us 0.000us 3
4194
- aten::reshape 0.21% 9.369us 0.54% 23.900us 3.983us 0.000us 0.00% 0.000us 0.000us 6
4195
- aten::view 0.33% 14.531us 0.33% 14.531us 2.422us 0.000us 0.00% 0.000us 0.000us 6
4196
- cudaDeviceSynchronize 53.53% 2.351ms 53.53% 2.351ms 2.351ms 0.000us 0.00% 0.000us 0.000us 1
4197
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4198
- Self CPU time total: 4.393ms
4199
- Self CUDA time total: 2.780ms
4200
 
4201
 
4202
 
@@ -4206,21 +4206,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
4206
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4207
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4208
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4209
- xformers_meff 6.91% 309.504us 45.24% 2.025ms 2.025ms 0.000us 0.00% 3.854ms 3.854ms 1
4210
- xformers_flash3::flash_fwd 3.30% 147.756us 37.80% 1.692ms 563.990us 0.000us 0.00% 3.854ms 1.285ms 3
4211
- flash_attn_3::fwd 1.19% 53.048us 34.50% 1.544ms 514.738us 2.875ms 100.00% 3.854ms 1.285ms 3
4212
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.877ms 100.05% 2.877ms 2.877ms 1
4213
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.875ms 100.00% 2.875ms 958.381us 3
4214
- Activity Buffer Request 31.77% 1.422ms 31.77% 1.422ms 1.422ms 979.266us 34.06% 979.266us 979.266us 1
4215
- aten::empty 0.67% 29.790us 0.67% 29.790us 4.965us 0.000us 0.00% 0.000us 0.000us 6
4216
- cudaFuncSetAttribute 0.12% 5.570us 0.12% 5.570us 1.857us 0.000us 0.00% 0.000us 0.000us 3
4217
- cudaLaunchKernel 0.76% 33.852us 0.76% 33.852us 11.284us 0.000us 0.00% 0.000us 0.000us 3
4218
- aten::reshape 0.22% 9.920us 0.53% 23.660us 3.943us 0.000us 0.00% 0.000us 0.000us 6
4219
- aten::view 0.31% 13.740us 0.31% 13.740us 2.290us 0.000us 0.00% 0.000us 0.000us 6
4220
- cudaDeviceSynchronize 54.76% 2.451ms 54.76% 2.451ms 2.451ms 0.000us 0.00% 0.000us 0.000us 1
4221
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4222
- Self CPU time total: 4.476ms
4223
- Self CUDA time total: 2.875ms
4224
 
4225
 
4226
 
@@ -4230,21 +4230,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
4230
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4231
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4232
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4233
- xformers_meff 6.53% 306.895us 47.96% 2.255ms 2.255ms 0.000us 0.00% 3.838ms 3.838ms 1
4234
- xformers_flash3::flash_fwd 3.09% 145.243us 40.94% 1.925ms 641.651us 0.000us 0.00% 3.838ms 1.279ms 3
4235
- flash_attn_3::fwd 1.17% 55.062us 37.85% 1.780ms 593.237us 2.865ms 100.00% 3.838ms 1.279ms 3
4236
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.866ms 100.05% 2.866ms 2.866ms 1
4237
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.865ms 100.00% 2.865ms 954.931us 3
4238
- Activity Buffer Request 30.23% 1.421ms 30.23% 1.421ms 1.421ms 973.182us 33.97% 973.182us 973.182us 1
4239
- aten::empty 0.63% 29.790us 0.63% 29.790us 4.965us 0.000us 0.00% 0.000us 0.000us 6
4240
- cudaFuncSetAttribute 0.11% 5.390us 0.11% 5.390us 1.797us 0.000us 0.00% 0.000us 0.000us 3
4241
- cudaLaunchKernel 5.70% 268.094us 5.70% 268.094us 89.365us 0.000us 0.00% 0.000us 0.000us 3
4242
- aten::reshape 0.19% 8.710us 0.49% 22.930us 3.822us 0.000us 0.00% 0.000us 0.000us 6
4243
- aten::view 0.30% 14.220us 0.30% 14.220us 2.370us 0.000us 0.00% 0.000us 0.000us 6
4244
- cudaDeviceSynchronize 52.04% 2.447ms 52.04% 2.447ms 2.447ms 0.000us 0.00% 0.000us 0.000us 1
4245
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4246
- Self CPU time total: 4.702ms
4247
- Self CUDA time total: 2.865ms
4248
 
4249
 
4250
 
@@ -4254,21 +4254,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
4254
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4255
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4256
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4257
- xformers_meff 6.46% 328.735us 43.31% 2.206ms 2.206ms 0.000us 0.00% 4.477ms 4.477ms 1
4258
- xformers_flash3::flash_fwd 3.06% 155.642us 36.36% 1.852ms 617.231us 0.000us 0.00% 4.477ms 1.492ms 3
4259
- flash_attn_3::fwd 1.12% 56.881us 33.30% 1.696ms 565.350us 3.348ms 100.00% 4.477ms 1.492ms 3
4260
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.349ms 100.04% 3.349ms 3.349ms 1
4261
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.348ms 100.00% 3.348ms 1.116ms 3
4262
- Activity Buffer Request 27.91% 1.421ms 27.91% 1.421ms 1.421ms 1.129ms 33.72% 1.129ms 1.129ms 1
4263
- aten::empty 0.63% 32.251us 0.63% 32.251us 5.375us 0.000us 0.00% 0.000us 0.000us 6
4264
- cudaFuncSetAttribute 0.11% 5.740us 0.11% 5.740us 1.913us 0.000us 0.00% 0.000us 0.000us 3
4265
- cudaLaunchKernel 3.53% 179.913us 3.53% 179.913us 59.971us 0.000us 0.00% 0.000us 0.000us 3
4266
- aten::reshape 0.21% 10.692us 0.50% 25.231us 4.205us 0.000us 0.00% 0.000us 0.000us 6
4267
- aten::view 0.29% 14.539us 0.29% 14.539us 2.423us 0.000us 0.00% 0.000us 0.000us 6
4268
- cudaDeviceSynchronize 56.69% 2.887ms 56.69% 2.887ms 2.887ms 0.000us 0.00% 0.000us 0.000us 1
4269
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4270
- Self CPU time total: 5.092ms
4271
- Self CUDA time total: 3.348ms
4272
 
4273
 
4274
 
@@ -4278,37 +4278,83 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
4278
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4279
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4280
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4281
- xformers_meff 6.24% 320.533us 43.45% 2.233ms 2.233ms 0.000us 0.00% 4.496ms 4.496ms 1
4282
- xformers_flash3::flash_fwd 2.90% 149.124us 36.73% 1.887ms 629.094us 0.000us 0.00% 4.496ms 1.499ms 3
4283
- flash_attn_3::fwd 1.48% 76.290us 33.83% 1.738ms 579.386us 3.368ms 100.00% 4.496ms 1.499ms 3
4284
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.369ms 100.05% 3.369ms 3.369ms 1
4285
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.368ms 100.00% 3.368ms 1.123ms 3
4286
- Activity Buffer Request 28.33% 1.456ms 28.33% 1.456ms 1.456ms 1.129ms 33.51% 1.129ms 1.129ms 1
4287
- aten::empty 0.58% 29.962us 0.58% 29.962us 4.994us 0.000us 0.00% 0.000us 0.000us 6
4288
- cudaFuncSetAttribute 0.12% 6.240us 0.12% 6.240us 2.080us 0.000us 0.00% 0.000us 0.000us 3
4289
- cudaLaunchKernel 3.31% 169.832us 3.31% 169.832us 56.611us 0.000us 0.00% 0.000us 0.000us 3
4290
- aten::reshape 0.21% 10.672us 0.48% 24.873us 4.146us 0.000us 0.00% 0.000us 0.000us 6
4291
- aten::view 0.28% 14.201us 0.28% 14.201us 2.367us 0.000us 0.00% 0.000us 0.000us 6
4292
- cudaDeviceSynchronize 56.55% 2.906ms 56.55% 2.906ms 2.906ms 0.000us 0.00% 0.000us 0.000us 1
4293
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4294
- Self CPU time total: 5.138ms
4295
- Self CUDA time total: 3.368ms
4296
 
4297
 
4298
  impl wl p50(ms) ok
4299
- xformers_meff cuda_attn_L128_bfloat16 0.98 True
4300
- xformers_meff cuda_attn_L256_bfloat16 1.02 True
4301
- xformers_meff cuda_attn_L320_bfloat16 1.07 True
4302
  xformers_meff cuda_attn_L384_bfloat16 1.08 True
4303
- xformers_meff cuda_attn_L448_bfloat16 1.24 True
4304
  xformers_meff cuda_attn_L512_bfloat16 1.23 True
4305
  </pre></div>
4306
  <div class="uv-install-logs" id="uv-logs-benchmark">
4307
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4308
  <div class="uv-logs-content" style="display: none;">
4309
  Downloading xformers (111.8MiB)
4310
  Downloading xformers
4311
- Installed 1 package in 13ms
  </div>
4313
  </div>
4314
  <div class="cell-artifacts">
 
4106
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4107
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
4108
  </span> |
4109
+ Cell: benchmark | 33.71s
4110
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4112
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4113
+ <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/xformers.md" target="_blank" class="github-btn">GitHub</a>
4114
  </div>
4115
  <div id="code-benchmark" class="cell-code" data-lines="30">
4116
  <div class="code-wrap">
 
4158
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4159
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4160
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4161
+ xformers_meff 10.98% 488.134us 52.82% 2.349ms 2.349ms 0.000us 0.00% 3.539ms 3.539ms 1
4162
+ xformers_flash3::flash_fwd 4.45% 198.034us 41.02% 1.824ms 608.009us 0.000us 0.00% 3.539ms 1.180ms 3
4163
+ flash_attn_3::fwd 1.81% 80.354us 36.57% 1.626ms 541.997us 2.647ms 100.00% 3.539ms 1.180ms 3
4164
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.648ms 100.06% 2.648ms 2.648ms 1
4165
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.647ms 100.00% 2.647ms 882.203us 3
4166
+ Activity Buffer Request 32.65% 1.452ms 32.65% 1.452ms 1.452ms 892.891us 33.74% 892.891us 892.891us 1
4167
+ aten::empty 0.78% 34.470us 0.78% 34.470us 5.745us 0.000us 0.00% 0.000us 0.000us 6
4168
+ cudaFuncSetAttribute 0.26% 11.370us 0.26% 11.370us 3.790us 0.000us 0.00% 0.000us 0.000us 3
4169
+ cudaLaunchKernel 1.08% 47.851us 1.08% 47.851us 15.950us 0.000us 0.00% 0.000us 0.000us 3
4170
+ aten::reshape 0.28% 12.261us 0.82% 36.420us 6.070us 0.000us 0.00% 0.000us 0.000us 6
4171
+ aten::view 0.54% 24.159us 0.54% 24.159us 4.026us 0.000us 0.00% 0.000us 0.000us 6
4172
+ cudaDeviceSynchronize 47.18% 2.098ms 47.18% 2.098ms 2.098ms 0.000us 0.00% 0.000us 0.000us 1
4173
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4174
+ Self CPU time total: 4.447ms
4175
+ Self CUDA time total: 2.647ms
4176
 
4177
 
4178
 
 
4182
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4183
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4184
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4185
+ xformers_meff 7.22% 318.208us 46.97% 2.070ms 2.070ms 0.000us 0.00% 3.700ms 3.700ms 1
4186
+ xformers_flash3::flash_fwd 3.33% 146.973us 39.20% 1.728ms 575.898us 0.000us 0.00% 3.700ms 1.233ms 3
4187
+ flash_attn_3::fwd 1.20% 53.004us 35.87% 1.581ms 526.907us 2.767ms 100.00% 3.700ms 1.233ms 3
4188
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.769ms 100.05% 2.769ms 2.769ms 1
4189
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.767ms 100.00% 2.767ms 922.499us 3
4190
+ Activity Buffer Request 33.12% 1.459ms 33.12% 1.459ms 1.459ms 932.857us 33.71% 932.857us 932.857us 1
4191
+ aten::empty 0.65% 28.790us 0.65% 28.790us 4.798us 0.000us 0.00% 0.000us 0.000us 6
4192
+ cudaFuncSetAttribute 0.13% 5.860us 0.13% 5.860us 1.953us 0.000us 0.00% 0.000us 0.000us 3
4193
+ cudaLaunchKernel 0.76% 33.580us 0.76% 33.580us 11.193us 0.000us 0.00% 0.000us 0.000us 3
4194
+ aten::reshape 0.21% 9.291us 0.54% 23.901us 3.983us 0.000us 0.00% 0.000us 0.000us 6
4195
+ aten::view 0.33% 14.610us 0.33% 14.610us 2.435us 0.000us 0.00% 0.000us 0.000us 6
4196
+ cudaDeviceSynchronize 53.03% 2.337ms 53.03% 2.337ms 2.337ms 0.000us 0.00% 0.000us 0.000us 1
4197
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4198
+ Self CPU time total: 4.407ms
4199
+ Self CUDA time total: 2.767ms
4200
 
4201
 
4202
 
 
4206
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4207
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4208
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4209
+ xformers_meff 6.87% 306.279us 45.67% 2.036ms 2.036ms 0.000us 0.00% 3.803ms 3.803ms 1
4210
+ xformers_flash3::flash_fwd 3.28% 146.193us 38.29% 1.707ms 568.871us 0.000us 0.00% 3.803ms 1.268ms 3
4211
+ flash_attn_3::fwd 1.22% 54.360us 35.01% 1.560ms 520.140us 2.841ms 100.00% 3.803ms 1.268ms 3
4212
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.843ms 100.05% 2.843ms 2.843ms 1
4213
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.841ms 100.00% 2.841ms 947.064us 3
4214
+ Activity Buffer Request 32.21% 1.435ms 32.21% 1.435ms 1.435ms 961.848us 33.85% 961.848us 961.848us 1
4215
+ aten::empty 0.68% 30.200us 0.68% 30.200us 5.033us 0.000us 0.00% 0.000us 0.000us 6
4216
+ cudaFuncSetAttribute 0.12% 5.560us 0.12% 5.560us 1.853us 0.000us 0.00% 0.000us 0.000us 3
4217
+ cudaLaunchKernel 0.78% 34.863us 0.78% 34.863us 11.621us 0.000us 0.00% 0.000us 0.000us 3
4218
+ aten::reshape 0.20% 8.808us 0.51% 22.610us 3.768us 0.000us 0.00% 0.000us 0.000us 6
4219
+ aten::view 0.31% 13.802us 0.31% 13.802us 2.300us 0.000us 0.00% 0.000us 0.000us 6
4220
+ cudaDeviceSynchronize 54.33% 2.422ms 54.33% 2.422ms 2.422ms 0.000us 0.00% 0.000us 0.000us 1
4221
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4222
+ Self CPU time total: 4.457ms
4223
+ Self CUDA time total: 2.841ms
4224
 
4225
 
4226
 
 
4230
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4231
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4232
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4233
+ xformers_meff 6.67% 311.798us 48.16% 2.253ms 2.253ms 0.000us 0.00% 3.854ms 3.854ms 1
4234
+ xformers_flash3::flash_fwd 3.68% 172.144us 40.98% 1.917ms 638.949us 0.000us 0.00% 3.854ms 1.285ms 3
4235
+ flash_attn_3::fwd 1.19% 55.670us 37.30% 1.745ms 581.568us 2.881ms 100.00% 3.854ms 1.285ms 3
4236
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.883ms 100.05% 2.883ms 2.883ms 1
4237
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.881ms 100.00% 2.881ms 960.465us 3
4238
+ Activity Buffer Request 30.77% 1.440ms 30.77% 1.440ms 1.440ms 972.603us 33.75% 972.603us 972.603us 1
4239
+ aten::empty 0.63% 29.580us 0.63% 29.580us 4.930us 0.000us 0.00% 0.000us 0.000us 6
4240
+ cudaFuncSetAttribute 0.12% 5.801us 0.12% 5.801us 1.934us 0.000us 0.00% 0.000us 0.000us 3
4241
+ cudaLaunchKernel 4.58% 214.036us 4.58% 214.036us 71.345us 0.000us 0.00% 0.000us 0.000us 3
4242
+ aten::reshape 0.19% 9.019us 0.51% 24.051us 4.009us 0.000us 0.00% 0.000us 0.000us 6
4243
+ aten::view 0.32% 15.032us 0.32% 15.032us 2.505us 0.000us 0.00% 0.000us 0.000us 6
4244
+ cudaDeviceSynchronize 51.84% 2.425ms 51.84% 2.425ms 2.425ms 0.000us 0.00% 0.000us 0.000us 1
4245
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4246
+ Self CPU time total: 4.678ms
4247
+ Self CUDA time total: 2.881ms
4248
 
4249
 
4250
 
 
4254
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4255
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4256
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4257
+ xformers_meff 5.88% 304.576us 42.22% 2.188ms 2.188ms 0.000us 0.00% 4.552ms 4.552ms 1
4258
+ xformers_flash3::flash_fwd 2.84% 147.154us 35.91% 1.861ms 620.213us 0.000us 0.00% 4.552ms 1.517ms 3
4259
+ flash_attn_3::fwd 1.02% 52.961us 33.07% 1.713ms 571.161us 3.412ms 100.00% 4.552ms 1.517ms 3
4260
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.414ms 100.04% 3.414ms 3.414ms 1
4261
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.412ms 100.00% 3.412ms 1.137ms 3
4262
+ Activity Buffer Request 27.95% 1.448ms 27.95% 1.448ms 1.448ms 1.140ms 33.41% 1.140ms 1.140ms 1
4263
+ aten::empty 0.56% 29.272us 0.56% 29.272us 4.879us 0.000us 0.00% 0.000us 0.000us 6
4264
+ cudaFuncSetAttribute 0.12% 6.180us 0.12% 6.180us 2.060us 0.000us 0.00% 0.000us 0.000us 3
4265
+ cudaLaunchKernel 3.41% 176.624us 3.41% 176.624us 58.875us 0.000us 0.00% 0.000us 0.000us 3
4266
+ aten::reshape 0.17% 9.052us 0.44% 22.882us 3.814us 0.000us 0.00% 0.000us 0.000us 6
4267
+ aten::view 0.27% 13.830us 0.27% 13.830us 2.305us 0.000us 0.00% 0.000us 0.000us 6
4268
+ cudaDeviceSynchronize 57.78% 2.994ms 57.78% 2.994ms 2.994ms 0.000us 0.00% 0.000us 0.000us 1
4269
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4270
+ Self CPU time total: 5.182ms
4271
+ Self CUDA time total: 3.412ms
4272
 
4273
 
4274
 
 
4278
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4279
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4280
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4281
+ xformers_meff 5.58% 285.697us 41.87% 2.143ms 2.143ms 0.000us 0.00% 4.544ms 4.544ms 1
4282
+ xformers_flash3::flash_fwd 2.91% 148.714us 35.83% 1.834ms 611.255us 0.000us 0.00% 4.544ms 1.515ms 3
4283
+ flash_attn_3::fwd 1.04% 53.311us 32.92% 1.685ms 561.684us 3.402ms 100.00% 4.544ms 1.515ms 3
4284
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.403ms 100.05% 3.403ms 3.403ms 1
4285
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.402ms 100.00% 3.402ms 1.134ms 3
4286
+ Activity Buffer Request 27.78% 1.422ms 27.78% 1.422ms 1.422ms 1.142ms 33.57% 1.142ms 1.142ms 1
4287
+ aten::empty 0.58% 29.640us 0.58% 29.640us 4.940us 0.000us 0.00% 0.000us 0.000us 6
4288
+ cudaFuncSetAttribute 0.12% 5.990us 0.12% 5.990us 1.997us 0.000us 0.00% 0.000us 0.000us 3
4289
+ cudaLaunchKernel 3.40% 174.134us 3.40% 174.134us 58.045us 0.000us 0.00% 0.000us 0.000us 3
4290
+ aten::reshape 0.17% 8.543us 0.45% 23.191us 3.865us 0.000us 0.00% 0.000us 0.000us 6
4291
+ aten::view 0.29% 14.648us 0.29% 14.648us 2.441us 0.000us 0.00% 0.000us 0.000us 6
4292
+ cudaDeviceSynchronize 58.13% 2.975ms 58.13% 2.975ms 2.975ms 0.000us 0.00% 0.000us 0.000us 1
4293
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4294
+ Self CPU time total: 5.118ms
4295
+ Self CUDA time total: 3.402ms
4296
 
4297
 
4298
  impl wl p50(ms) ok
4299
+ xformers_meff cuda_attn_L128_bfloat16 1.00 True
4300
+ xformers_meff cuda_attn_L256_bfloat16 1.03 True
4301
+ xformers_meff cuda_attn_L320_bfloat16 1.08 True
4302
  xformers_meff cuda_attn_L384_bfloat16 1.08 True
4303
+ xformers_meff cuda_attn_L448_bfloat16 1.25 True
4304
  xformers_meff cuda_attn_L512_bfloat16 1.23 True
4305
  </pre></div>
4306
  <div class="uv-install-logs" id="uv-logs-benchmark">
4307
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4308
  <div class="uv-logs-content" style="display: none;">
4309
+ Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4310
+ Downloading networkx (1.9MiB)
4311
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4312
+ Downloading fonttools (4.7MiB)
4313
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4314
+ Downloading numpy (16.2MiB)
4315
+ Downloading torch (846.9MiB)
4316
+ Downloading setuptools (1.1MiB)
4317
+ Downloading nvidia-cublas-cu12 (566.8MiB)
4318
+ Downloading sympy (6.0MiB)
4319
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4320
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4321
+ Downloading kiwisolver (1.4MiB)
4322
+ Downloading matplotlib (8.3MiB)
4323
+ Downloading triton (148.3MiB)
4324
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4325
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4326
+ Downloading nvidia-curand-cu12 (60.7MiB)
4327
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4328
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4329
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4330
  Downloading xformers (111.8MiB)
4331
+ Downloading pillow (6.7MiB)
4332
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4333
+ Downloading nvidia-cufile-cu12
4334
+ Downloading kiwisolver
4335
+ Downloading setuptools
4336
+ Downloading networkx
4337
+ Downloading fonttools
4338
+ Downloading pillow
4339
+ Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4340
+ Downloading nvidia-cuda-cupti-cu12
4341
+ Downloading matplotlib
4342
+ Downloading sympy
4343
+ Downloading numpy
4344
+ Downloading nvidia-nvjitlink-cu12
4345
+ Downloading nvidia-curand-cu12
4346
+ Downloading nvidia-cuda-nvrtc-cu12
4347
  Downloading xformers
4348
+ Downloading triton
4349
+ Downloading nvidia-cufft-cu12
4350
+ Downloading nvidia-cusolver-cu12
4351
+ Downloading nvidia-cusparse-cu12
4352
+ Downloading nvidia-cusparselt-cu12
4353
+ Downloading nvidia-nccl-cu12
4354
+ Downloading nvidia-cublas-cu12
4355
+ Downloading nvidia-cudnn-cu12
4356
+ Downloading torch
4357
+ Installed 38 packages in 236ms
4358
  </div>
4359
  </div>
4360
  <div class="cell-artifacts">
flash_attn/results/artifacts/combine/latency.svg CHANGED

Git LFS Details (old)

  • SHA256: c6390d15c17c1cced5612c62eb1fb07f7304765d3d9c2c842f634fd3107bbeaf
  • Pointer size: 130 Bytes
  • Size of remote file: 24.8 kB

Git LFS Details (new)

  • SHA256: 520b28a43c879f6952cf0ddeade1438dbb5bd7caf01b6509254a4c68e9446ee6
  • Pointer size: 130 Bytes
  • Size of remote file: 24.8 kB
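latency.svg is the combined plot regenerated by the combine cell from each implementation's JSONL artifact, which is why only its LFS pointer changes in this commit. A hedged sketch of that aggregation step, assuming the lat_ms.p50 / wl.name schema used by the benchmark artifacts elsewhere in this repo (the input path is illustrative):

import collections
import json
import matplotlib.pyplot as plt

series = collections.defaultdict(dict)  # impl -> {workload: p50_ms}
with open("artifacts/benchmark/attention.jsonl") as f:
    for line in f:
        rec = json.loads(line)
        if rec.get("ok"):  # skip failed runs such as sage_int8_fp16
            series[rec["impl"]][rec["wl"]["name"]] = rec["lat_ms"]["p50"]

for impl, pts in series.items():
    wls = sorted(pts)
    plt.plot(wls, [pts[w] for w in wls], marker="o", label=impl)
plt.ylabel("p50 latency (ms)")
plt.xticks(rotation=45, ha="right")
plt.legend()
plt.tight_layout()
plt.savefig("latency.svg")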
flash_attn/results/combined_results.html CHANGED
@@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content {
4107
  <rdf:RDF>
4108
  <ns2:Work>
4109
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4110
- <dc:date>2025-10-30T15:53:53.940454</dc:date>
4111
  <dc:format>image/svg+xml</dc:format>
4112
  <dc:creator>
4113
  <ns2:Agent>
@@ -4217,96 +4217,96 @@ body[data-tool="eraser"] .main-content {
4217
  <g id="matplotlib.axis_2">
4218
  <g id="ytick_1">
4219
  <g id="grid-y--2" class="grid grid-y">
4220
- <path d="M 47.81 402.410473 L 835.361742 402.410473 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4221
  </g>
4222
  <g id="line2d_7">
4223
  <defs>
4224
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4225
  </defs>
4226
  <g>
4227
- <use ns4:href="#m0fca2865ba" x="47.81" y="402.410473" style="stroke: #000000; stroke-width: 0.8" />
4228
  </g>
4229
  </g>
4230
  <g id="text_7">
4231
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="406.209692" transform="rotate(-0 40.81 406.209692)">1.0</text>
4232
  </g>
4233
  </g>
4234
  <g id="ytick_2">
4235
  <g id="grid-y--3" class="grid grid-y">
4236
- <path d="M 47.81 343.789654 L 835.361742 343.789654 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4237
  </g>
4238
  <g id="line2d_8">
4239
  <g>
4240
- <use ns4:href="#m0fca2865ba" x="47.81" y="343.789654" style="stroke: #000000; stroke-width: 0.8" />
4241
  </g>
4242
  </g>
4243
  <g id="text_8">
4244
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="347.588873" transform="rotate(-0 40.81 347.588873)">1.2</text>
4245
  </g>
4246
  </g>
4247
  <g id="ytick_3">
4248
  <g id="grid-y--4" class="grid grid-y">
4249
- <path d="M 47.81 285.168836 L 835.361742 285.168836 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4250
  </g>
4251
  <g id="line2d_9">
4252
  <g>
4253
- <use ns4:href="#m0fca2865ba" x="47.81" y="285.168836" style="stroke: #000000; stroke-width: 0.8" />
4254
  </g>
4255
  </g>
4256
  <g id="text_9">
4257
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="288.968055" transform="rotate(-0 40.81 288.968055)">1.4</text>
4258
  </g>
4259
  </g>
4260
  <g id="ytick_4">
4261
  <g id="grid-y--5" class="grid grid-y">
4262
- <path d="M 47.81 226.548018 L 835.361742 226.548018 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4263
  </g>
4264
  <g id="line2d_10">
4265
  <g>
4266
- <use ns4:href="#m0fca2865ba" x="47.81" y="226.548018" style="stroke: #000000; stroke-width: 0.8" />
4267
  </g>
4268
  </g>
4269
  <g id="text_10">
4270
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="230.347236" transform="rotate(-0 40.81 230.347236)">1.6</text>
4271
  </g>
4272
  </g>
4273
  <g id="ytick_5">
4274
  <g id="grid-y--6" class="grid grid-y">
4275
- <path d="M 47.81 167.927199 L 835.361742 167.927199 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4276
  </g>
4277
  <g id="line2d_11">
4278
  <g>
4279
- <use ns4:href="#m0fca2865ba" x="47.81" y="167.927199" style="stroke: #000000; stroke-width: 0.8" />
4280
  </g>
4281
  </g>
4282
  <g id="text_11">
4283
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="171.726418" transform="rotate(-0 40.81 171.726418)">1.8</text>
4284
  </g>
4285
  </g>
4286
  <g id="ytick_6">
4287
  <g id="grid-y--7" class="grid grid-y">
4288
- <path d="M 47.81 109.306381 L 835.361742 109.306381 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4289
  </g>
4290
  <g id="line2d_12">
4291
  <g>
4292
- <use ns4:href="#m0fca2865ba" x="47.81" y="109.306381" style="stroke: #000000; stroke-width: 0.8" />
4293
  </g>
4294
  </g>
4295
  <g id="text_12">
4296
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="113.1056" transform="rotate(-0 40.81 113.1056)">2.0</text>
4297
  </g>
4298
  </g>
4299
  <g id="ytick_7">
4300
  <g id="grid-y--8" class="grid grid-y">
4301
- <path d="M 47.81 50.685563 L 835.361742 50.685563 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4302
  </g>
4303
  <g id="line2d_13">
4304
  <g>
4305
- <use ns4:href="#m0fca2865ba" x="47.81" y="50.685563" style="stroke: #000000; stroke-width: 0.8" />
4306
  </g>
4307
  </g>
4308
  <g id="text_13">
4309
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="54.484781" transform="rotate(-0 40.81 54.484781)">2.2</text>
4310
  </g>
4311
  </g>
4312
  <g id="label--y" class="ylabel">
@@ -4314,73 +4314,73 @@ body[data-tool="eraser"] .main-content {
4314
  </g>
4315
  </g>
4316
  <g id="series--torch-flash-ma" class="series">
4317
- <path d="M 83.607806 338.320039 L 226.799032 324.329888 L 369.990258 318.590616 L 513.181484 313.901244 L 656.37271 271.916135 L 799.563935 259.376848 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4318
  <defs>
4319
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4320
  </defs>
4321
  <g clip-path="url(#p09feef2583)">
4322
- <use ns4:href="#md7efaf3aec" x="83.607806" y="338.320039" style="fill: #1f77b4; stroke: #1f77b4" />
4323
- <use ns4:href="#md7efaf3aec" x="226.799032" y="324.329888" style="fill: #1f77b4; stroke: #1f77b4" />
4324
- <use ns4:href="#md7efaf3aec" x="369.990258" y="318.590616" style="fill: #1f77b4; stroke: #1f77b4" />
4325
- <use ns4:href="#md7efaf3aec" x="513.181484" y="313.901244" style="fill: #1f77b4; stroke: #1f77b4" />
4326
- <use ns4:href="#md7efaf3aec" x="656.37271" y="271.916135" style="fill: #1f77b4; stroke: #1f77b4" />
4327
- <use ns4:href="#md7efaf3aec" x="799.563935" y="259.376848" style="fill: #1f77b4; stroke: #1f77b4" />
4328
  </g>
4329
  </g>
4330
  <g id="series--torch-mem-eff" class="series">
4331
- <path d="M 83.607806 163.963846 L 226.799032 145.342943 L 369.990258 117.045795 L 513.181484 117.544365 L 656.37271 83.816291 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4332
  <defs>
4333
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4334
  </defs>
4335
  <g clip-path="url(#p09feef2583)">
4336
- <use ns4:href="#m9b8c54d372" x="83.607806" y="163.963846" style="fill: #ff7f0e; stroke: #ff7f0e" />
4337
- <use ns4:href="#m9b8c54d372" x="226.799032" y="145.342943" style="fill: #ff7f0e; stroke: #ff7f0e" />
4338
- <use ns4:href="#m9b8c54d372" x="369.990258" y="117.045795" style="fill: #ff7f0e; stroke: #ff7f0e" />
4339
- <use ns4:href="#m9b8c54d372" x="513.181484" y="117.544365" style="fill: #ff7f0e; stroke: #ff7f0e" />
4340
- <use ns4:href="#m9b8c54d372" x="656.37271" y="83.816291" style="fill: #ff7f0e; stroke: #ff7f0e" />
4341
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4342
  </g>
4343
  </g>
4344
  <g id="series--xformers-meff" class="series">
4345
- <path d="M 83.607806 407.071707 L 226.799032 396.194321 L 369.990258 382.362446 L 513.181484 378.056747 L 656.37271 332.261284 L 799.563935 334.228013 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4346
  <defs>
4347
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4348
  </defs>
4349
  <g clip-path="url(#p09feef2583)">
4350
- <use ns4:href="#mc655281e0b" x="83.607806" y="407.071707" style="fill: #2ca02c; stroke: #2ca02c" />
4351
- <use ns4:href="#mc655281e0b" x="226.799032" y="396.194321" style="fill: #2ca02c; stroke: #2ca02c" />
4352
- <use ns4:href="#mc655281e0b" x="369.990258" y="382.362446" style="fill: #2ca02c; stroke: #2ca02c" />
4353
- <use ns4:href="#mc655281e0b" x="513.181484" y="378.056747" style="fill: #2ca02c; stroke: #2ca02c" />
4354
- <use ns4:href="#mc655281e0b" x="656.37271" y="332.261284" style="fill: #2ca02c; stroke: #2ca02c" />
4355
- <use ns4:href="#mc655281e0b" x="799.563935" y="334.228013" style="fill: #2ca02c; stroke: #2ca02c" />
4356
  </g>
4357
  </g>
4358
  <g id="series--hf-kernels-flash-attn" class="series">
4359
- <path d="M 83.607806 418.848923 L 226.799032 406.104464 L 369.990258 393.547884 L 513.181484 387.046249 L 656.37271 340.26625 L 799.563935 333.615718 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4360
  <defs>
4361
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4362
  </defs>
4363
  <g clip-path="url(#p09feef2583)">
4364
- <use ns4:href="#m61c8040d7e" x="83.607806" y="418.848923" style="fill: #d62728; stroke: #d62728" />
4365
- <use ns4:href="#m61c8040d7e" x="226.799032" y="406.104464" style="fill: #d62728; stroke: #d62728" />
4366
- <use ns4:href="#m61c8040d7e" x="369.990258" y="393.547884" style="fill: #d62728; stroke: #d62728" />
4367
- <use ns4:href="#m61c8040d7e" x="513.181484" y="387.046249" style="fill: #d62728; stroke: #d62728" />
4368
- <use ns4:href="#m61c8040d7e" x="656.37271" y="340.26625" style="fill: #d62728; stroke: #d62728" />
4369
- <use ns4:href="#m61c8040d7e" x="799.563935" y="333.615718" style="fill: #d62728; stroke: #d62728" />
4370
  </g>
4371
  </g>
4372
  <g id="series--hf-kernels-flash-attn3" class="series">
4373
- <path d="M 83.607806 428.387702 L 226.799032 417.179109 L 369.990258 396.852047 L 513.181484 396.728943 L 656.37271 348.383475 L 799.563935 348.523872 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4374
  <defs>
4375
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4376
  </defs>
4377
  <g clip-path="url(#p09feef2583)">
4378
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4379
- <use ns4:href="#m7cd35be9cc" x="226.799032" y="417.179109" style="fill: #9467bd; stroke: #9467bd" />
4380
- <use ns4:href="#m7cd35be9cc" x="369.990258" y="396.852047" style="fill: #9467bd; stroke: #9467bd" />
4381
- <use ns4:href="#m7cd35be9cc" x="513.181484" y="396.728943" style="fill: #9467bd; stroke: #9467bd" />
4382
- <use ns4:href="#m7cd35be9cc" x="656.37271" y="348.383475" style="fill: #9467bd; stroke: #9467bd" />
4383
- <use ns4:href="#m7cd35be9cc" x="799.563935" y="348.523872" style="fill: #9467bd; stroke: #9467bd" />
4384
  </g>
4385
  </g>
4386
  <g id="patch_3">
@@ -4465,7 +4465,7 @@ body[data-tool="eraser"] .main-content {
4465
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4466
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4467
  </span> |
4468
- Cell: combine | 4.26s
4469
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4470
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4471
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4572,47 +4572,47 @@ Summary: 6 found, 0 skipped, 0 missing
4572
  COMBINED BENCHMARK SUMMARY
4573
 
4574
  impl wl p50(ms) ok
4575
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.94 True
4576
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 0.99 True
4577
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.03 True
4578
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.05 True
4579
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
4580
  hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
4581
- hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.91 True
4582
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.95 True
4583
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.02 True
4584
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
4585
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
4586
  hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4587
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4588
- Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
4589
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4590
- Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
4591
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4592
- Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
4593
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4594
- Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
4595
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4596
- Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
4597
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4598
- Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
4599
  torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
4600
- torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
4601
- torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
4602
- torch_flash_ma cuda_attn_L384_bfloat16 1.30 True
4603
- torch_flash_ma cuda_attn_L448_bfloat16 1.45 True
4604
- torch_flash_ma cuda_attn_L512_bfloat16 1.49 True
4605
- torch_mem_eff cuda_attn_L128_bfloat16 1.81 True
4606
- torch_mem_eff cuda_attn_L256_bfloat16 1.88 True
4607
- torch_mem_eff cuda_attn_L320_bfloat16 1.97 True
4608
- torch_mem_eff cuda_attn_L384_bfloat16 1.97 True
4609
- torch_mem_eff cuda_attn_L448_bfloat16 2.09 True
4610
- torch_mem_eff cuda_attn_L512_bfloat16 2.22 True
4611
- xformers_meff cuda_attn_L128_bfloat16 0.98 True
4612
- xformers_meff cuda_attn_L256_bfloat16 1.02 True
4613
- xformers_meff cuda_attn_L320_bfloat16 1.07 True
4614
  xformers_meff cuda_attn_L384_bfloat16 1.08 True
4615
- xformers_meff cuda_attn_L448_bfloat16 1.24 True
4616
  xformers_meff cuda_attn_L512_bfloat16 1.23 True
4617
 
4618
  GENERATING COMBINED VISUALIZATION
@@ -4637,7 +4637,7 @@ Implementations included:
4637
  <div class="uv-install-logs" id="uv-logs-combine">
4638
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4639
  <div class="uv-logs-content" style="display: none;">
4640
- Installed 37 packages in 190ms
4641
  </div>
4642
  </div>
4643
  <div class="cell-artifacts">
@@ -4650,7 +4650,7 @@ Installed 37 packages in 190ms
4650
  <rdf:RDF>
4651
  <ns2:Work>
4652
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4653
- <dc:date>2025-10-30T15:53:53.940454</dc:date>
4654
  <dc:format>image/svg+xml</dc:format>
4655
  <dc:creator>
4656
  <ns2:Agent>
@@ -4760,96 +4760,96 @@ Installed 37 packages in 190ms
4760
  <g id="matplotlib.axis_2">
4761
  <g id="ytick_1">
4762
  <g id="grid-y--2" class="grid grid-y">
4763
- <path d="M 47.81 402.410473 L 835.361742 402.410473 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4764
  </g>
4765
  <g id="line2d_7">
4766
  <defs>
4767
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4768
  </defs>
4769
  <g>
4770
- <use ns4:href="#m0fca2865ba" x="47.81" y="402.410473" style="stroke: #000000; stroke-width: 0.8" />
4771
  </g>
4772
  </g>
4773
  <g id="text_7">
4774
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="406.209692" transform="rotate(-0 40.81 406.209692)">1.0</text>
4775
  </g>
4776
  </g>
4777
  <g id="ytick_2">
4778
  <g id="grid-y--3" class="grid grid-y">
4779
- <path d="M 47.81 343.789654 L 835.361742 343.789654 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4780
  </g>
4781
  <g id="line2d_8">
4782
  <g>
4783
- <use ns4:href="#m0fca2865ba" x="47.81" y="343.789654" style="stroke: #000000; stroke-width: 0.8" />
4784
  </g>
4785
  </g>
4786
  <g id="text_8">
4787
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="347.588873" transform="rotate(-0 40.81 347.588873)">1.2</text>
4788
  </g>
4789
  </g>
4790
  <g id="ytick_3">
4791
  <g id="grid-y--4" class="grid grid-y">
4792
- <path d="M 47.81 285.168836 L 835.361742 285.168836 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4793
  </g>
4794
  <g id="line2d_9">
4795
  <g>
4796
- <use ns4:href="#m0fca2865ba" x="47.81" y="285.168836" style="stroke: #000000; stroke-width: 0.8" />
4797
  </g>
4798
  </g>
4799
  <g id="text_9">
4800
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="288.968055" transform="rotate(-0 40.81 288.968055)">1.4</text>
4801
  </g>
4802
  </g>
4803
  <g id="ytick_4">
4804
  <g id="grid-y--5" class="grid grid-y">
4805
- <path d="M 47.81 226.548018 L 835.361742 226.548018 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4806
  </g>
4807
  <g id="line2d_10">
4808
  <g>
4809
- <use ns4:href="#m0fca2865ba" x="47.81" y="226.548018" style="stroke: #000000; stroke-width: 0.8" />
4810
  </g>
4811
  </g>
4812
  <g id="text_10">
4813
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="230.347236" transform="rotate(-0 40.81 230.347236)">1.6</text>
4814
  </g>
4815
  </g>
4816
  <g id="ytick_5">
4817
  <g id="grid-y--6" class="grid grid-y">
4818
- <path d="M 47.81 167.927199 L 835.361742 167.927199 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4819
  </g>
4820
  <g id="line2d_11">
4821
  <g>
4822
- <use ns4:href="#m0fca2865ba" x="47.81" y="167.927199" style="stroke: #000000; stroke-width: 0.8" />
4823
  </g>
4824
  </g>
4825
  <g id="text_11">
4826
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="171.726418" transform="rotate(-0 40.81 171.726418)">1.8</text>
4827
  </g>
4828
  </g>
4829
  <g id="ytick_6">
4830
  <g id="grid-y--7" class="grid grid-y">
4831
- <path d="M 47.81 109.306381 L 835.361742 109.306381 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4832
  </g>
4833
  <g id="line2d_12">
4834
  <g>
4835
- <use ns4:href="#m0fca2865ba" x="47.81" y="109.306381" style="stroke: #000000; stroke-width: 0.8" />
4836
  </g>
4837
  </g>
4838
  <g id="text_12">
4839
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="113.1056" transform="rotate(-0 40.81 113.1056)">2.0</text>
4840
  </g>
4841
  </g>
4842
  <g id="ytick_7">
4843
  <g id="grid-y--8" class="grid grid-y">
4844
- <path d="M 47.81 50.685563 L 835.361742 50.685563 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4845
  </g>
4846
  <g id="line2d_13">
4847
  <g>
4848
- <use ns4:href="#m0fca2865ba" x="47.81" y="50.685563" style="stroke: #000000; stroke-width: 0.8" />
4849
  </g>
4850
  </g>
4851
  <g id="text_13">
4852
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="54.484781" transform="rotate(-0 40.81 54.484781)">2.2</text>
4853
  </g>
4854
  </g>
4855
  <g id="label--y" class="ylabel">
@@ -4857,73 +4857,73 @@ Installed 37 packages in 190ms
4857
  </g>
4858
  </g>
4859
  <g id="series--torch-flash-ma" class="series">
4860
- <path d="M 83.607806 338.320039 L 226.799032 324.329888 L 369.990258 318.590616 L 513.181484 313.901244 L 656.37271 271.916135 L 799.563935 259.376848 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4861
  <defs>
4862
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4863
  </defs>
4864
  <g clip-path="url(#p09feef2583)">
4865
- <use ns4:href="#md7efaf3aec" x="83.607806" y="338.320039" style="fill: #1f77b4; stroke: #1f77b4" />
4866
- <use ns4:href="#md7efaf3aec" x="226.799032" y="324.329888" style="fill: #1f77b4; stroke: #1f77b4" />
4867
- <use ns4:href="#md7efaf3aec" x="369.990258" y="318.590616" style="fill: #1f77b4; stroke: #1f77b4" />
4868
- <use ns4:href="#md7efaf3aec" x="513.181484" y="313.901244" style="fill: #1f77b4; stroke: #1f77b4" />
4869
- <use ns4:href="#md7efaf3aec" x="656.37271" y="271.916135" style="fill: #1f77b4; stroke: #1f77b4" />
4870
- <use ns4:href="#md7efaf3aec" x="799.563935" y="259.376848" style="fill: #1f77b4; stroke: #1f77b4" />
4871
  </g>
4872
  </g>
4873
  <g id="series--torch-mem-eff" class="series">
4874
- <path d="M 83.607806 163.963846 L 226.799032 145.342943 L 369.990258 117.045795 L 513.181484 117.544365 L 656.37271 83.816291 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4875
  <defs>
4876
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4877
  </defs>
4878
  <g clip-path="url(#p09feef2583)">
4879
- <use ns4:href="#m9b8c54d372" x="83.607806" y="163.963846" style="fill: #ff7f0e; stroke: #ff7f0e" />
4880
- <use ns4:href="#m9b8c54d372" x="226.799032" y="145.342943" style="fill: #ff7f0e; stroke: #ff7f0e" />
4881
- <use ns4:href="#m9b8c54d372" x="369.990258" y="117.045795" style="fill: #ff7f0e; stroke: #ff7f0e" />
4882
- <use ns4:href="#m9b8c54d372" x="513.181484" y="117.544365" style="fill: #ff7f0e; stroke: #ff7f0e" />
4883
- <use ns4:href="#m9b8c54d372" x="656.37271" y="83.816291" style="fill: #ff7f0e; stroke: #ff7f0e" />
4884
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4885
  </g>
4886
  </g>
4887
  <g id="series--xformers-meff" class="series">
4888
- <path d="M 83.607806 407.071707 L 226.799032 396.194321 L 369.990258 382.362446 L 513.181484 378.056747 L 656.37271 332.261284 L 799.563935 334.228013 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4889
  <defs>
4890
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4891
  </defs>
4892
  <g clip-path="url(#p09feef2583)">
4893
- <use ns4:href="#mc655281e0b" x="83.607806" y="407.071707" style="fill: #2ca02c; stroke: #2ca02c" />
4894
- <use ns4:href="#mc655281e0b" x="226.799032" y="396.194321" style="fill: #2ca02c; stroke: #2ca02c" />
4895
- <use ns4:href="#mc655281e0b" x="369.990258" y="382.362446" style="fill: #2ca02c; stroke: #2ca02c" />
4896
- <use ns4:href="#mc655281e0b" x="513.181484" y="378.056747" style="fill: #2ca02c; stroke: #2ca02c" />
4897
- <use ns4:href="#mc655281e0b" x="656.37271" y="332.261284" style="fill: #2ca02c; stroke: #2ca02c" />
4898
- <use ns4:href="#mc655281e0b" x="799.563935" y="334.228013" style="fill: #2ca02c; stroke: #2ca02c" />
4899
  </g>
4900
  </g>
4901
  <g id="series--hf-kernels-flash-attn" class="series">
4902
- <path d="M 83.607806 418.848923 L 226.799032 406.104464 L 369.990258 393.547884 L 513.181484 387.046249 L 656.37271 340.26625 L 799.563935 333.615718 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4903
  <defs>
4904
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4905
  </defs>
4906
  <g clip-path="url(#p09feef2583)">
4907
- <use ns4:href="#m61c8040d7e" x="83.607806" y="418.848923" style="fill: #d62728; stroke: #d62728" />
4908
- <use ns4:href="#m61c8040d7e" x="226.799032" y="406.104464" style="fill: #d62728; stroke: #d62728" />
4909
- <use ns4:href="#m61c8040d7e" x="369.990258" y="393.547884" style="fill: #d62728; stroke: #d62728" />
4910
- <use ns4:href="#m61c8040d7e" x="513.181484" y="387.046249" style="fill: #d62728; stroke: #d62728" />
4911
- <use ns4:href="#m61c8040d7e" x="656.37271" y="340.26625" style="fill: #d62728; stroke: #d62728" />
4912
- <use ns4:href="#m61c8040d7e" x="799.563935" y="333.615718" style="fill: #d62728; stroke: #d62728" />
4913
  </g>
4914
  </g>
4915
  <g id="series--hf-kernels-flash-attn3" class="series">
4916
- <path d="M 83.607806 428.387702 L 226.799032 417.179109 L 369.990258 396.852047 L 513.181484 396.728943 L 656.37271 348.383475 L 799.563935 348.523872 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4917
  <defs>
4918
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4919
  </defs>
4920
  <g clip-path="url(#p09feef2583)">
4921
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4922
- <use ns4:href="#m7cd35be9cc" x="226.799032" y="417.179109" style="fill: #9467bd; stroke: #9467bd" />
4923
- <use ns4:href="#m7cd35be9cc" x="369.990258" y="396.852047" style="fill: #9467bd; stroke: #9467bd" />
4924
- <use ns4:href="#m7cd35be9cc" x="513.181484" y="396.728943" style="fill: #9467bd; stroke: #9467bd" />
4925
- <use ns4:href="#m7cd35be9cc" x="656.37271" y="348.383475" style="fill: #9467bd; stroke: #9467bd" />
4926
- <use ns4:href="#m7cd35be9cc" x="799.563935" y="348.523872" style="fill: #9467bd; stroke: #9467bd" />
4927
  </g>
4928
  </g>
4929
  <g id="patch_3">
 
4107
  <rdf:RDF>
4108
  <ns2:Work>
4109
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4110
+ <dc:date>2025-10-31T20:14:18.946177</dc:date>
4111
  <dc:format>image/svg+xml</dc:format>
4112
  <dc:creator>
4113
  <ns2:Agent>
 
4217
  <g id="matplotlib.axis_2">
4218
  <g id="ytick_1">
4219
  <g id="grid-y--2" class="grid grid-y">
4220
+ <path d="M 47.81 406.365305 L 835.361742 406.365305 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4221
  </g>
4222
  <g id="line2d_7">
4223
  <defs>
4224
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4225
  </defs>
4226
  <g>
4227
+ <use ns4:href="#m0fca2865ba" x="47.81" y="406.365305" style="stroke: #000000; stroke-width: 0.8" />
4228
  </g>
4229
  </g>
4230
  <g id="text_7">
4231
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="410.164524" transform="rotate(-0 40.81 410.164524)">1.0</text>
4232
  </g>
4233
  </g>
4234
  <g id="ytick_2">
4235
  <g id="grid-y--3" class="grid grid-y">
4236
+ <path d="M 47.81 348.61376 L 835.361742 348.61376 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4237
  </g>
4238
  <g id="line2d_8">
4239
  <g>
4240
+ <use ns4:href="#m0fca2865ba" x="47.81" y="348.61376" style="stroke: #000000; stroke-width: 0.8" />
4241
  </g>
4242
  </g>
4243
  <g id="text_8">
4244
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="352.412978" transform="rotate(-0 40.81 352.412978)">1.2</text>
4245
  </g>
4246
  </g>
4247
  <g id="ytick_3">
4248
  <g id="grid-y--4" class="grid grid-y">
4249
+ <path d="M 47.81 290.862214 L 835.361742 290.862214 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4250
  </g>
4251
  <g id="line2d_9">
4252
  <g>
4253
+ <use ns4:href="#m0fca2865ba" x="47.81" y="290.862214" style="stroke: #000000; stroke-width: 0.8" />
4254
  </g>
4255
  </g>
4256
  <g id="text_9">
4257
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="294.661433" transform="rotate(-0 40.81 294.661433)">1.4</text>
4258
  </g>
4259
  </g>
4260
  <g id="ytick_4">
4261
  <g id="grid-y--5" class="grid grid-y">
4262
+ <path d="M 47.81 233.110668 L 835.361742 233.110668 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4263
  </g>
4264
  <g id="line2d_10">
4265
  <g>
4266
+ <use ns4:href="#m0fca2865ba" x="47.81" y="233.110668" style="stroke: #000000; stroke-width: 0.8" />
4267
  </g>
4268
  </g>
4269
  <g id="text_10">
4270
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="236.909887" transform="rotate(-0 40.81 236.909887)">1.6</text>
4271
  </g>
4272
  </g>
4273
  <g id="ytick_5">
4274
  <g id="grid-y--6" class="grid grid-y">
4275
+ <path d="M 47.81 175.359123 L 835.361742 175.359123 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4276
  </g>
4277
  <g id="line2d_11">
4278
  <g>
4279
+ <use ns4:href="#m0fca2865ba" x="47.81" y="175.359123" style="stroke: #000000; stroke-width: 0.8" />
4280
  </g>
4281
  </g>
4282
  <g id="text_11">
4283
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="179.158342" transform="rotate(-0 40.81 179.158342)">1.8</text>
4284
  </g>
4285
  </g>
4286
  <g id="ytick_6">
4287
  <g id="grid-y--7" class="grid grid-y">
4288
+ <path d="M 47.81 117.607577 L 835.361742 117.607577 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4289
  </g>
4290
  <g id="line2d_12">
4291
  <g>
4292
+ <use ns4:href="#m0fca2865ba" x="47.81" y="117.607577" style="stroke: #000000; stroke-width: 0.8" />
4293
  </g>
4294
  </g>
4295
  <g id="text_12">
4296
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="121.406796" transform="rotate(-0 40.81 121.406796)">2.0</text>
4297
  </g>
4298
  </g>
4299
  <g id="ytick_7">
4300
  <g id="grid-y--8" class="grid grid-y">
4301
+ <path d="M 47.81 59.856031 L 835.361742 59.856031 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4302
  </g>
4303
  <g id="line2d_13">
4304
  <g>
4305
+ <use ns4:href="#m0fca2865ba" x="47.81" y="59.856031" style="stroke: #000000; stroke-width: 0.8" />
4306
  </g>
4307
  </g>
4308
  <g id="text_13">
4309
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="63.65525" transform="rotate(-0 40.81 63.65525)">2.2</text>
4310
  </g>
4311
  </g>
4312
  <g id="label--y" class="ylabel">
 
4314
  </g>
4315
  </g>
4316
  <g id="series--torch-flash-ma" class="series">
4317
+ <path d="M 83.607806 344.244567 L 226.799032 326.470951 L 369.990258 319.632879 L 513.181484 311.200865 L 656.37271 263.410306 L 799.563935 258.605377 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4318
  <defs>
4319
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4320
  </defs>
4321
  <g clip-path="url(#p09feef2583)">
4322
+ <use ns4:href="#md7efaf3aec" x="83.607806" y="344.244567" style="fill: #1f77b4; stroke: #1f77b4" />
4323
+ <use ns4:href="#md7efaf3aec" x="226.799032" y="326.470951" style="fill: #1f77b4; stroke: #1f77b4" />
4324
+ <use ns4:href="#md7efaf3aec" x="369.990258" y="319.632879" style="fill: #1f77b4; stroke: #1f77b4" />
4325
+ <use ns4:href="#md7efaf3aec" x="513.181484" y="311.200865" style="fill: #1f77b4; stroke: #1f77b4" />
4326
+ <use ns4:href="#md7efaf3aec" x="656.37271" y="263.410306" style="fill: #1f77b4; stroke: #1f77b4" />
4327
+ <use ns4:href="#md7efaf3aec" x="799.563935" y="258.605377" style="fill: #1f77b4; stroke: #1f77b4" />
4328
  </g>
4329
  </g>
4330
  <g id="series--torch-mem-eff" class="series">
4331
+ <path d="M 83.607806 160.220133 L 226.799032 131.522812 L 369.990258 119.284971 L 513.181484 97.052936 L 656.37271 99.854174 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4332
  <defs>
4333
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4334
  </defs>
4335
  <g clip-path="url(#p09feef2583)">
4336
+ <use ns4:href="#m9b8c54d372" x="83.607806" y="160.220133" style="fill: #ff7f0e; stroke: #ff7f0e" />
4337
+ <use ns4:href="#m9b8c54d372" x="226.799032" y="131.522812" style="fill: #ff7f0e; stroke: #ff7f0e" />
4338
+ <use ns4:href="#m9b8c54d372" x="369.990258" y="119.284971" style="fill: #ff7f0e; stroke: #ff7f0e" />
4339
+ <use ns4:href="#m9b8c54d372" x="513.181484" y="97.052936" style="fill: #ff7f0e; stroke: #ff7f0e" />
4340
+ <use ns4:href="#m9b8c54d372" x="656.37271" y="99.854174" style="fill: #ff7f0e; stroke: #ff7f0e" />
4341
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4342
  </g>
4343
  </g>
4344
  <g id="series--xformers-meff" class="series">
4345
+ <path d="M 83.607806 406.681206 L 226.799032 399.095541 L 369.990258 382.16221 L 513.181484 383.640938 L 656.37271 334.388976 L 799.563935 340.779474 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4346
  <defs>
4347
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4348
  </defs>
4349
  <g clip-path="url(#p09feef2583)">
4350
+ <use ns4:href="#mc655281e0b" x="83.607806" y="406.681206" style="fill: #2ca02c; stroke: #2ca02c" />
4351
+ <use ns4:href="#mc655281e0b" x="226.799032" y="399.095541" style="fill: #2ca02c; stroke: #2ca02c" />
4352
+ <use ns4:href="#mc655281e0b" x="369.990258" y="382.16221" style="fill: #2ca02c; stroke: #2ca02c" />
4353
+ <use ns4:href="#mc655281e0b" x="513.181484" y="383.640938" style="fill: #2ca02c; stroke: #2ca02c" />
4354
+ <use ns4:href="#mc655281e0b" x="656.37271" y="334.388976" style="fill: #2ca02c; stroke: #2ca02c" />
4355
+ <use ns4:href="#mc655281e0b" x="799.563935" y="340.779474" style="fill: #2ca02c; stroke: #2ca02c" />
4356
  </g>
4357
  </g>
4358
  <g id="series--hf-kernels-flash-attn" class="series">
4359
+ <path d="M 83.607806 420.013439 L 226.799032 405.003813 L 369.990258 391.079337 L 513.181484 388.024281 L 656.37271 340.106668 L 799.563935 341.194996 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4360
  <defs>
4361
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4362
  </defs>
4363
  <g clip-path="url(#p09feef2583)">
4364
+ <use ns4:href="#m61c8040d7e" x="83.607806" y="420.013439" style="fill: #d62728; stroke: #d62728" />
4365
+ <use ns4:href="#m61c8040d7e" x="226.799032" y="405.003813" style="fill: #d62728; stroke: #d62728" />
4366
+ <use ns4:href="#m61c8040d7e" x="369.990258" y="391.079337" style="fill: #d62728; stroke: #d62728" />
4367
+ <use ns4:href="#m61c8040d7e" x="513.181484" y="388.024281" style="fill: #d62728; stroke: #d62728" />
4368
+ <use ns4:href="#m61c8040d7e" x="656.37271" y="340.106668" style="fill: #d62728; stroke: #d62728" />
4369
+ <use ns4:href="#m61c8040d7e" x="799.563935" y="341.194996" style="fill: #d62728; stroke: #d62728" />
4370
  </g>
4371
  </g>
4372
  <g id="series--hf-kernels-flash-attn3" class="series">
4373
+ <path d="M 83.607806 428.387702 L 226.799032 418.228917 L 369.990258 402.378716 L 513.181484 397.605262 L 656.37271 348.593258 L 799.563935 355.437105 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4374
  <defs>
4375
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4376
  </defs>
4377
  <g clip-path="url(#p09feef2583)">
4378
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4379
+ <use ns4:href="#m7cd35be9cc" x="226.799032" y="418.228917" style="fill: #9467bd; stroke: #9467bd" />
4380
+ <use ns4:href="#m7cd35be9cc" x="369.990258" y="402.378716" style="fill: #9467bd; stroke: #9467bd" />
4381
+ <use ns4:href="#m7cd35be9cc" x="513.181484" y="397.605262" style="fill: #9467bd; stroke: #9467bd" />
4382
+ <use ns4:href="#m7cd35be9cc" x="656.37271" y="348.593258" style="fill: #9467bd; stroke: #9467bd" />
4383
+ <use ns4:href="#m7cd35be9cc" x="799.563935" y="355.437105" style="fill: #9467bd; stroke: #9467bd" />
4384
  </g>
4385
  </g>
4386
  <g id="patch_3">
 
4465
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4466
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4467
  </span> |
4468
+ Cell: combine | 4.31s
4469
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4470
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4471
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4572
  COMBINED BENCHMARK SUMMARY
4573
 
4574
  impl wl p50(ms) ok
4575
+ hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
4576
+ hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
4577
+ hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
4578
+ hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
4579
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.23 True
4580
  hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
4581
+ hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
4582
+ hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True
4583
+ hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
4584
+ hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.03 True
4585
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
4586
  hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4587
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4588
+ Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
4589
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4590
+ Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
4591
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4592
+ Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
4593
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4594
+ Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
4595
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4596
+ Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
4597
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4598
+ Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
4599
  torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
4600
+ torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
4601
+ torch_flash_ma cuda_attn_L320_bfloat16 1.30 True
4602
+ torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
4603
+ torch_flash_ma cuda_attn_L448_bfloat16 1.50 True
4604
+ torch_flash_ma cuda_attn_L512_bfloat16 1.51 True
4605
+ torch_mem_eff cuda_attn_L128_bfloat16 1.85 True
4606
+ torch_mem_eff cuda_attn_L256_bfloat16 1.95 True
4607
+ torch_mem_eff cuda_attn_L320_bfloat16 1.99 True
4608
+ torch_mem_eff cuda_attn_L384_bfloat16 2.07 True
4609
+ torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
4610
+ torch_mem_eff cuda_attn_L512_bfloat16 2.25 True
4611
+ xformers_meff cuda_attn_L128_bfloat16 1.00 True
4612
+ xformers_meff cuda_attn_L256_bfloat16 1.03 True
4613
+ xformers_meff cuda_attn_L320_bfloat16 1.08 True
4614
  xformers_meff cuda_attn_L384_bfloat16 1.08 True
4615
+ xformers_meff cuda_attn_L448_bfloat16 1.25 True
4616
  xformers_meff cuda_attn_L512_bfloat16 1.23 True
4617
 
4618
  GENERATING COMBINED VISUALIZATION
 
4637
  <div class="uv-install-logs" id="uv-logs-combine">
4638
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4639
  <div class="uv-logs-content" style="display: none;">
4640
+ Installed 37 packages in 225ms
4641
  </div>
4642
  </div>
4643
  <div class="cell-artifacts">
 
4650
  <rdf:RDF>
4651
  <ns2:Work>
4652
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4653
+ <dc:date>2025-10-31T20:14:18.946177</dc:date>
4654
  <dc:format>image/svg+xml</dc:format>
4655
  <dc:creator>
4656
  <ns2:Agent>
 
4760
  <g id="matplotlib.axis_2">
4761
  <g id="ytick_1">
4762
  <g id="grid-y--2" class="grid grid-y">
4763
+ <path d="M 47.81 406.365305 L 835.361742 406.365305 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4764
  </g>
4765
  <g id="line2d_7">
4766
  <defs>
4767
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4768
  </defs>
4769
  <g>
4770
+ <use ns4:href="#m0fca2865ba" x="47.81" y="406.365305" style="stroke: #000000; stroke-width: 0.8" />
4771
  </g>
4772
  </g>
4773
  <g id="text_7">
4774
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="410.164524" transform="rotate(-0 40.81 410.164524)">1.0</text>
4775
  </g>
4776
  </g>
4777
  <g id="ytick_2">
4778
  <g id="grid-y--3" class="grid grid-y">
4779
+ <path d="M 47.81 348.61376 L 835.361742 348.61376 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4780
  </g>
4781
  <g id="line2d_8">
4782
  <g>
4783
+ <use ns4:href="#m0fca2865ba" x="47.81" y="348.61376" style="stroke: #000000; stroke-width: 0.8" />
4784
  </g>
4785
  </g>
4786
  <g id="text_8">
4787
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="352.412978" transform="rotate(-0 40.81 352.412978)">1.2</text>
4788
  </g>
4789
  </g>
4790
  <g id="ytick_3">
4791
  <g id="grid-y--4" class="grid grid-y">
4792
+ <path d="M 47.81 290.862214 L 835.361742 290.862214 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4793
  </g>
4794
  <g id="line2d_9">
4795
  <g>
4796
+ <use ns4:href="#m0fca2865ba" x="47.81" y="290.862214" style="stroke: #000000; stroke-width: 0.8" />
4797
  </g>
4798
  </g>
4799
  <g id="text_9">
4800
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="294.661433" transform="rotate(-0 40.81 294.661433)">1.4</text>
4801
  </g>
4802
  </g>
4803
  <g id="ytick_4">
4804
  <g id="grid-y--5" class="grid grid-y">
4805
+ <path d="M 47.81 233.110668 L 835.361742 233.110668 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4806
  </g>
4807
  <g id="line2d_10">
4808
  <g>
4809
+ <use ns4:href="#m0fca2865ba" x="47.81" y="233.110668" style="stroke: #000000; stroke-width: 0.8" />
4810
  </g>
4811
  </g>
4812
  <g id="text_10">
4813
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="236.909887" transform="rotate(-0 40.81 236.909887)">1.6</text>
4814
  </g>
4815
  </g>
4816
  <g id="ytick_5">
4817
  <g id="grid-y--6" class="grid grid-y">
4818
+ <path d="M 47.81 175.359123 L 835.361742 175.359123 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4819
  </g>
4820
  <g id="line2d_11">
4821
  <g>
4822
+ <use ns4:href="#m0fca2865ba" x="47.81" y="175.359123" style="stroke: #000000; stroke-width: 0.8" />
4823
  </g>
4824
  </g>
4825
  <g id="text_11">
4826
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="179.158342" transform="rotate(-0 40.81 179.158342)">1.8</text>
4827
  </g>
4828
  </g>
4829
  <g id="ytick_6">
4830
  <g id="grid-y--7" class="grid grid-y">
4831
+ <path d="M 47.81 117.607577 L 835.361742 117.607577 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4832
  </g>
4833
  <g id="line2d_12">
4834
  <g>
4835
+ <use ns4:href="#m0fca2865ba" x="47.81" y="117.607577" style="stroke: #000000; stroke-width: 0.8" />
4836
  </g>
4837
  </g>
4838
  <g id="text_12">
4839
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="121.406796" transform="rotate(-0 40.81 121.406796)">2.0</text>
4840
  </g>
4841
  </g>
4842
  <g id="ytick_7">
4843
  <g id="grid-y--8" class="grid grid-y">
4844
+ <path d="M 47.81 59.856031 L 835.361742 59.856031 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4845
  </g>
4846
  <g id="line2d_13">
4847
  <g>
4848
+ <use ns4:href="#m0fca2865ba" x="47.81" y="59.856031" style="stroke: #000000; stroke-width: 0.8" />
4849
  </g>
4850
  </g>
4851
  <g id="text_13">
4852
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="63.65525" transform="rotate(-0 40.81 63.65525)">2.2</text>
4853
  </g>
4854
  </g>
4855
  <g id="label--y" class="ylabel">
 
4857
  </g>
4858
  </g>
4859
  <g id="series--torch-flash-ma" class="series">
4860
+ <path d="M 83.607806 344.244567 L 226.799032 326.470951 L 369.990258 319.632879 L 513.181484 311.200865 L 656.37271 263.410306 L 799.563935 258.605377 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4861
  <defs>
4862
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4863
  </defs>
4864
  <g clip-path="url(#p09feef2583)">
4865
+ <use ns4:href="#md7efaf3aec" x="83.607806" y="344.244567" style="fill: #1f77b4; stroke: #1f77b4" />
4866
+ <use ns4:href="#md7efaf3aec" x="226.799032" y="326.470951" style="fill: #1f77b4; stroke: #1f77b4" />
4867
+ <use ns4:href="#md7efaf3aec" x="369.990258" y="319.632879" style="fill: #1f77b4; stroke: #1f77b4" />
4868
+ <use ns4:href="#md7efaf3aec" x="513.181484" y="311.200865" style="fill: #1f77b4; stroke: #1f77b4" />
4869
+ <use ns4:href="#md7efaf3aec" x="656.37271" y="263.410306" style="fill: #1f77b4; stroke: #1f77b4" />
4870
+ <use ns4:href="#md7efaf3aec" x="799.563935" y="258.605377" style="fill: #1f77b4; stroke: #1f77b4" />
4871
  </g>
4872
  </g>
4873
  <g id="series--torch-mem-eff" class="series">
4874
+ <path d="M 83.607806 160.220133 L 226.799032 131.522812 L 369.990258 119.284971 L 513.181484 97.052936 L 656.37271 99.854174 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4875
  <defs>
4876
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4877
  </defs>
4878
  <g clip-path="url(#p09feef2583)">
4879
+ <use ns4:href="#m9b8c54d372" x="83.607806" y="160.220133" style="fill: #ff7f0e; stroke: #ff7f0e" />
4880
+ <use ns4:href="#m9b8c54d372" x="226.799032" y="131.522812" style="fill: #ff7f0e; stroke: #ff7f0e" />
4881
+ <use ns4:href="#m9b8c54d372" x="369.990258" y="119.284971" style="fill: #ff7f0e; stroke: #ff7f0e" />
4882
+ <use ns4:href="#m9b8c54d372" x="513.181484" y="97.052936" style="fill: #ff7f0e; stroke: #ff7f0e" />
4883
+ <use ns4:href="#m9b8c54d372" x="656.37271" y="99.854174" style="fill: #ff7f0e; stroke: #ff7f0e" />
4884
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4885
  </g>
4886
  </g>
4887
  <g id="series--xformers-meff" class="series">
4888
+ <path d="M 83.607806 406.681206 L 226.799032 399.095541 L 369.990258 382.16221 L 513.181484 383.640938 L 656.37271 334.388976 L 799.563935 340.779474 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4889
  <defs>
4890
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4891
  </defs>
4892
  <g clip-path="url(#p09feef2583)">
4893
+ <use ns4:href="#mc655281e0b" x="83.607806" y="406.681206" style="fill: #2ca02c; stroke: #2ca02c" />
4894
+ <use ns4:href="#mc655281e0b" x="226.799032" y="399.095541" style="fill: #2ca02c; stroke: #2ca02c" />
4895
+ <use ns4:href="#mc655281e0b" x="369.990258" y="382.16221" style="fill: #2ca02c; stroke: #2ca02c" />
4896
+ <use ns4:href="#mc655281e0b" x="513.181484" y="383.640938" style="fill: #2ca02c; stroke: #2ca02c" />
4897
+ <use ns4:href="#mc655281e0b" x="656.37271" y="334.388976" style="fill: #2ca02c; stroke: #2ca02c" />
4898
+ <use ns4:href="#mc655281e0b" x="799.563935" y="340.779474" style="fill: #2ca02c; stroke: #2ca02c" />
4899
  </g>
4900
  </g>
4901
  <g id="series--hf-kernels-flash-attn" class="series">
4902
+ <path d="M 83.607806 420.013439 L 226.799032 405.003813 L 369.990258 391.079337 L 513.181484 388.024281 L 656.37271 340.106668 L 799.563935 341.194996 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4903
  <defs>
4904
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4905
  </defs>
4906
  <g clip-path="url(#p09feef2583)">
4907
+ <use ns4:href="#m61c8040d7e" x="83.607806" y="420.013439" style="fill: #d62728; stroke: #d62728" />
4908
+ <use ns4:href="#m61c8040d7e" x="226.799032" y="405.003813" style="fill: #d62728; stroke: #d62728" />
4909
+ <use ns4:href="#m61c8040d7e" x="369.990258" y="391.079337" style="fill: #d62728; stroke: #d62728" />
4910
+ <use ns4:href="#m61c8040d7e" x="513.181484" y="388.024281" style="fill: #d62728; stroke: #d62728" />
4911
+ <use ns4:href="#m61c8040d7e" x="656.37271" y="340.106668" style="fill: #d62728; stroke: #d62728" />
4912
+ <use ns4:href="#m61c8040d7e" x="799.563935" y="341.194996" style="fill: #d62728; stroke: #d62728" />
4913
  </g>
4914
  </g>
4915
  <g id="series--hf-kernels-flash-attn3" class="series">
4916
+ <path d="M 83.607806 428.387702 L 226.799032 418.228917 L 369.990258 402.378716 L 513.181484 397.605262 L 656.37271 348.593258 L 799.563935 355.437105 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4917
  <defs>
4918
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4919
  </defs>
4920
  <g clip-path="url(#p09feef2583)">
4921
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4922
+ <use ns4:href="#m7cd35be9cc" x="226.799032" y="418.228917" style="fill: #9467bd; stroke: #9467bd" />
4923
+ <use ns4:href="#m7cd35be9cc" x="369.990258" y="402.378716" style="fill: #9467bd; stroke: #9467bd" />
4924
+ <use ns4:href="#m7cd35be9cc" x="513.181484" y="397.605262" style="fill: #9467bd; stroke: #9467bd" />
4925
+ <use ns4:href="#m7cd35be9cc" x="656.37271" y="348.593258" style="fill: #9467bd; stroke: #9467bd" />
4926
+ <use ns4:href="#m7cd35be9cc" x="799.563935" y="355.437105" style="fill: #9467bd; stroke: #9467bd" />
4927
  </g>
4928
  </g>
4929
  <g id="patch_3">
index.html CHANGED
@@ -4097,35 +4097,54 @@ body[data-tool="eraser"] .main-content {
4097
  </div>
4098
 
4099
  <div class="main-content">
4100
- <h1>KERNELS COMMUNITY BENCHMARKS</h1>
 
 
 
 
 
 
 
 
 
4101
  <p>This report aggregates latency and performance benchmarks across core model components.<br />
4102
  Each section includes:<br />
4103
  - A latency visualization<br />
4104
  - Links to detailed implementation benchmarks </p>
4105
  <h2>TABLE OF CONTENTS</h2>
4106
  <ul>
4107
- <li><a href="#methodology">METHODOLOGY</a></li>
4108
- <li><a href="#layer-normalization">LAYER NORMALIZATION</a></li>
4109
- <li><a href="#rotary-position-embeddings">ROTARY POSITION EMBEDDINGS</a></li>
4110
  <li><a href="#flash-attention">FLASH ATTENTION</a></li>
 
 
 
4111
  <li><a href="#causal-conv1d">CAUSAL CONV1D</a></li>
4112
- <li><a href="#activation-functions">ACTIVATION FUNCTIONS</a></li>
4113
- <li><a href="#notes">NOTES</a></li>
4114
  </ul>
 
 
 
 
 
 
 
 
4115
  <h2>METHODOLOGY</h2>
4116
- <p>Each benchmark is run with the <a href="https://github.com/huggingface/kernels-benchmarks">Kernels Benchmarking Framework</a> and follows these principles:<br />
 
4117
  - a reference implementation (usually PyTorch native) is included for baseline comparison<br />
4118
  - multiple input sizes and batch sizes are tested to reflect real-world usage<br />
4119
  - runs are repeatable via python virtual environments and documented dependencies<br />
4120
  - results are collected and visualized using standardized scripts </p>
4121
- <hr />
 
4122
  <div class="alert">
4123
  <strong>Note:</strong> Latency values are measured in milliseconds (ms). Lower values indicate better performance.
4124
  </div>
4125
 
4126
- <h2>LAYER NORMALIZATION</h2>
4127
  <div class="artifact-preview">
4128
- <img src="layer_norm/results/artifacts/combine/latency.svg" alt="Layer Norm Latency" width="800">
4129
  </div>
4130
 
4131
  <table>
@@ -4133,32 +4152,40 @@ Each section includes:<br />
4133
  <tr>
4134
  <th>Implementation</th>
4135
  <th>Description</th>
 
 
 
4136
  </tr>
4137
  </thead>
4138
  <tbody>
4139
  <tr>
4140
- <td>HF Kernels Layer Norm</td>
4141
- <td>HuggingFace kernels implementation</td>
 
 
 
4142
  </tr>
4143
  <tr>
4144
- <td>PyTorch Layer Norm</td>
4145
- <td>PyTorch native implementation</td>
 
 
 
4146
  </tr>
4147
  </tbody>
4148
  </table>
4149
  <p align="center">
4150
- <!-- <button onclick="window.location.href='layer_norm/'" style="margin-left: 20px; padding: 10px 20px; background-color: #007bff; color: white; border: none; border-radius: 5px; cursor: pointer;"> -->
4151
- <button
4152
- onclick="window.location.href='layer_norm/'"
4153
  class="btn">
4154
  Explore Full Bench
4155
  </button>
4156
  </p>
4157
 
4158
  <hr />
4159
- <h2>ROTARY POSITION EMBEDDINGS</h2>
4160
  <div class="artifact-preview">
4161
- <img src="rotary/results/artifacts/combine/latency.svg" alt="Rotary Position Embeddings Latency" width="800">
4162
  </div>
4163
 
4164
  <table>
@@ -4166,31 +4193,68 @@ Each section includes:<br />
4166
  <tr>
4167
  <th>Implementation</th>
4168
  <th>Description</th>
 
 
 
4169
  </tr>
4170
  </thead>
4171
  <tbody>
4172
  <tr>
4173
- <td>HF Kernels Rotary</td>
4174
- <td>HuggingFace kernels implementation</td>
 
 
 
4175
  </tr>
4176
  <tr>
4177
- <td>PyTorch Rotary</td>
4178
- <td>PyTorch native implementation</td>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4179
  </tr>
4180
  </tbody>
4181
  </table>
4182
  <p align="center">
4183
  <button
4184
- onclick="window.location.href='rotary/'"
4185
  class="btn">
4186
  Explore Full Bench
4187
  </button>
4188
  </p>
4189
 
4190
  <hr />
4191
- <h2>FLASH ATTENTION</h2>
4192
  <div class="artifact-preview">
4193
- <img src="flash_attn/results/artifacts/combine/latency.svg" alt="Flash Attention Latency" width="800">
4194
  </div>
4195
 
4196
  <table>
@@ -4198,38 +4262,72 @@ Each section includes:<br />
4198
  <tr>
4199
  <th>Implementation</th>
4200
  <th>Description</th>
 
 
 
4201
  </tr>
4202
  </thead>
4203
  <tbody>
4204
  <tr>
4205
- <td>Flash Attention</td>
4206
- <td>Flash Attention implementation</td>
4207
- </tr>
4208
- <tr>
4209
- <td>HF Kernels Flash Attention</td>
4210
- <td>HuggingFace kernels Flash Attention</td>
4211
  </tr>
4212
  <tr>
4213
- <td>HF Kernels Flash Attention 3</td>
4214
- <td>HuggingFace kernels Flash Attention 3</td>
 
 
 
4215
  </tr>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4216
  <tr>
4217
- <td>Memory Efficient Attention</td>
4218
- <td>Memory efficient attention implementation</td>
 
 
 
4219
  </tr>
 
 
4220
  <tr>
4221
- <td>Sage Attention</td>
4222
- <td>Sage attention implementation</td>
 
 
 
4223
  </tr>
4224
  <tr>
4225
- <td>xFormers</td>
4226
- <td>xFormers attention implementation</td>
 
 
 
4227
  </tr>
4228
  </tbody>
4229
  </table>
4230
  <p align="center">
4231
  <button
4232
- onclick="window.location.href='flash_attn/'"
4233
  class="btn">
4234
  Explore Full Bench
4235
  </button>
@@ -4246,16 +4344,25 @@ Each section includes:<br />
4246
  <tr>
4247
  <th>Implementation</th>
4248
  <th>Description</th>
 
 
 
4249
  </tr>
4250
  </thead>
4251
  <tbody>
4252
  <tr>
4253
  <td>HF Kernels Causal Conv1D</td>
4254
  <td>HuggingFace kernels implementation</td>
 
 
 
4255
  </tr>
4256
  <tr>
4257
  <td>PyTorch Causal Conv1D</td>
4258
  <td>PyTorch native implementation</td>
 
 
 
4259
  </tr>
4260
  </tbody>
4261
  </table>
@@ -4268,9 +4375,9 @@ Each section includes:<br />
4268
  </p>
4269
 
4270
  <hr />
4271
- <h2>ACTIVATION FUNCTIONS</h2>
4272
  <div class="artifact-preview">
4273
- <img src="activation/results/artifacts/combine/latency.svg" alt="Activation Latency" width="800">
4274
  </div>
4275
 
4276
  <table>
@@ -4278,28 +4385,77 @@ Each section includes:<br />
4278
  <tr>
4279
  <th>Implementation</th>
4280
  <th>Description</th>
 
 
 
4281
  </tr>
4282
  </thead>
4283
  <tbody>
4284
  <tr>
4285
- <td>HF Kernels SwiGLU</td>
4286
- <td>HuggingFace kernels SwiGLU implementation</td>
 
 
 
4287
  </tr>
4288
  <tr>
4289
- <td>PyTorch SwiGLU</td>
4290
- <td>PyTorch native SwiGLU implementation</td>
 
 
 
4291
  </tr>
4292
  </tbody>
4293
  </table>
4294
  <p align="center">
4295
  <button
4296
- onclick="window.location.href='activation/'"
4297
  class="btn">
4298
  Explore Full Bench
4299
  </button>
4300
  </p>
4301
 
4302
  <hr />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4303
  <style>
4304
  .controls {
4305
  display: none !important;
@@ -4343,12 +4499,10 @@ Each section includes:<br />
4343
  }
4344
  :root {
4345
  --bg-alert: #0069cbff;
4346
- --border-alert: #001628ff;
4347
  }
4348
  .alert {
4349
- padding: 5px;
4350
  background-color: var(--bg-alert);
4351
- border-left: 6px solid var(--border-alert);
4352
  margin-bottom: 10px;
4353
  border-radius: 6px;
4354
  }
 
4097
  </div>
4098
 
4099
  <div class="main-content">
4100
+ <div class="linkbar">
4101
+ <a target="_blank" href="https://github.com/huggingface/kernels">Python Library</a> |
4102
+ <a target="_blank" href="https://github.com/huggingface/kernel-builder">Builder</a> |
4103
+ <a target="_blank" href="https://github.com/huggingface/kernels-community">Community</a> |
4104
+ <a target="_blank" href="https://huggingface.co/kernels-community">Community Hub</a> |
4105
+ <a target="_blank" href="https://github.com/huggingface/kernels-benchmarks">Benchmarks</a>
4106
+ </div>
4107
+
4108
+ <p><br/></p>
4109
+ <h1>KERNELS COMMUNITY BENCHMARKS</h1>
4110
  <p>This report aggregates latency and performance benchmarks across core model components.<br />
4111
  Each section includes:<br />
4112
  - A latency visualization<br />
4113
  - Links to detailed implementation benchmarks </p>
4114
  <h2>TABLE OF CONTENTS</h2>
4115
  <ul>
4116
+ <li><a href="#activation-functions">ACTIVATION FUNCTIONS</a></li>
 
 
4117
  <li><a href="#flash-attention">FLASH ATTENTION</a></li>
4118
+ <li><a href="#deformable-detr">DEFORMABLE DETR</a></li>
4119
+ <li><a href="#openai-style-moe">OPENAI-STYLE MOE</a></li>
4120
+ <li><a href="#rotary-position-embeddings">ROTARY POSITION EMBEDDINGS</a></li>
4121
  <li><a href="#causal-conv1d">CAUSAL CONV1D</a></li>
4122
+ <li><a href="#layer-normaliz=ation">LAYER NORMALIZATION</a></li>
 
4123
  </ul>
4124
+ <h2>RUN YOURSELF</h2>
4125
+ <p>To run the benchmarks locally, clone the repository and use <code>uvx</code> to build and run them:</p>
4126
+ <p>Note: the benches are designed to run on a machine with a compatible NVIDIA GPU and CUDA installed; other hardware may not work as expected.</p>
4127
+ <div class="codehilite"><pre><span></span><code>git<span class="w"> </span>clone<span class="w"> </span>https://github.com/huggingface/kernels-benchmarks.git
4128
+ <span class="nb">cd</span><span class="w"> </span>kernels-benchmarks
4129
+ uvx<span class="w"> </span>https://github.com/drbh/uvnote.git<span class="w"> </span>build<span class="w"> </span>benches
4130
+ </code></pre></div>
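(The `uvx ... build benches` invocation above regenerates the per-cell pages in this commit; each cell exposes its source through its `Raw` link, e.g. `cells/combine.py`, and records its dependency resolution in the `UV Install Logs` panel, so a bench can presumably also be re-run cell by cell via the per-cell run buttons.)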
4131
+
4132
  <h2>METHODOLOGY</h2>
4133
+ <p>Each benchmark is run with the
4134
+ <a target="_blank" href="https://github.com/huggingface/kernels-benchmarks">Kernels Benchmarking Framework</a> and follows these principles:<br />
4135
  - a reference implementation (usually PyTorch native) is included for baseline comparison<br />
4136
  - multiple input sizes and batch sizes are tested to reflect real-world usage<br />
4137
  - runs are repeatable via python virtual environments and documented dependencies<br />
4138
  - results are collected and visualized using standardized scripts </p>
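As a concrete illustration of those principles, here is a minimal sketch of such a measurement loop in plain PyTorch: untimed warmup iterations, a handful of timed repetitions whose median is reported as the p50 latency in milliseconds, and an allclose-style correctness gate against a native reference. The names below (`swiglu_ref`, `bench`) and the rep/warmup counts and tolerances are illustrative assumptions, not the framework's actual API.

```python
# Minimal sketch of a benchmark measurement loop -- illustrative only,
# not the Kernels Benchmarking Framework's real code.
import time
import torch

def swiglu_ref(x: torch.Tensor) -> torch.Tensor:
    # Reference SwiGLU: split the last dim in two and gate with SiLU.
    a, b = x.chunk(2, dim=-1)
    return torch.nn.functional.silu(a) * b

def bench(impl, x: torch.Tensor, reps: int = 5, warmup: int = 2) -> float:
    for _ in range(warmup):               # untimed warmup iterations
        impl(x)
    times = []
    for _ in range(reps):                 # timed repetitions
        if x.is_cuda:
            torch.cuda.synchronize()      # don't time queued async work
        t0 = time.perf_counter()
        impl(x)
        if x.is_cuda:
            torch.cuda.synchronize()
        times.append((time.perf_counter() - t0) * 1e3)  # milliseconds
    return sorted(times)[len(times) // 2]               # p50 (median)

x = torch.randn(128, 2 * 768)             # num_tokens x (2 * hidden_dim)
ref = swiglu_ref(x)
out = swiglu_ref(x)                       # stand-in for a candidate kernel
ok = torch.allclose(out, ref, rtol=2e-2, atol=2e-2)     # correctness gate
print(f"p50={bench(swiglu_ref, x):.3f} ms  ok={ok}")
```

The median latency and the correctness flag are what surface as the `p50(ms)` and `ok` columns in the combined benchmark summaries above.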
4139
+ <p><br/></p>
4140
+ <h2>BENCHMARKS</h2>
4141
  <div class="alert">
4142
  <strong>Note:</strong> Latency values are measured in milliseconds (ms). Lower values indicate better performance.
4143
  </div>
4144
 
4145
+ <h2>ACTIVATION FUNCTIONS</h2>
4146
  <div class="artifact-preview">
4147
+ <img src="activation/results/artifacts/combine/latency.svg" alt="Activation Latency" width="800">
4148
  </div>
4149
 
4150
  <table>
 
4152
  <tr>
4153
  <th>Implementation</th>
4154
  <th>Description</th>
4155
+ <th>Source</th>
4156
+ <th>HF</th>
4157
+ <th>Bench</th>
4158
  </tr>
4159
  </thead>
4160
  <tbody>
4161
  <tr>
4162
+ <td>HF Kernels SwiGLU</td>
4163
+ <td>HuggingFace kernels SwiGLU implementation</td>
4164
+ <td><a href="https://github.com/huggingface/kernels-community/tree/main/activation">GitHub</a></td>
4165
+ <td><a href="https://huggingface.co/kernels-community/activation">HF</a></td>
4166
+ <td><a href="activation/impls/hf_kernels_swiglu.html">Bench</a></td>
4167
  </tr>
4168
  <tr>
4169
+ <td>PyTorch SwiGLU</td>
4170
+ <td>PyTorch native SwiGLU implementation</td>
4171
+ <td>-</td>
4172
+ <td>-</td>
4173
+ <td><a href="activation/impls/torch_swiglu.html">Bench</a></td>
4174
  </tr>
4175
  </tbody>
4176
  </table>
4177
  <p align="center">
4178
+ <button
4179
+ onclick="window.location.href='/#/activation/'"
 
4180
  class="btn">
4181
  Explore Full Bench
4182
  </button>
4183
  </p>
4184
 
4185
  <hr />
4186
+ <h2>FLASH ATTENTION</h2>
4187
  <div class="artifact-preview">
4188
+ <img src="flash_attn/results/artifacts/combine/latency.svg" alt="Flash Attention Latency" width="800">
4189
  </div>
4190
 
4191
  <table>
 
4193
  <tr>
4194
  <th>Implementation</th>
4195
  <th>Description</th>
4196
+ <th>Source</th>
4197
+ <th>HF</th>
4198
+ <th>Bench</th>
4199
  </tr>
4200
  </thead>
4201
  <tbody>
4202
  <tr>
4203
+ <td>Flash Attention</td>
4204
+ <td>Torch SDPA Flash Attention implementation</td>
4205
+ <td>-</td>
4206
+ <td>-</td>
4207
+ <td><a href="flash_attn/impls/flash_attention.html">Bench</a></td>
4208
  </tr>
4209
  <tr>
4210
+ <td>HF Kernels Flash Attention 2</td>
4211
+ <td>HuggingFace kernels Flash Attention</td>
4212
+ <td><a href="https://github.com/huggingface/kernels-community/tree/main/flash-attn2">GitHub</a></td>
4213
+ <td><a href="https://huggingface.co/kernels-community/flash-attn2">HF</a></td>
4214
+ <td><a href="flash_attn/impls/hf_kernels_flash_attn.html">Bench</a></td>
4215
+ </tr>
4216
+ <tr>
4217
+ <td>HF Kernels Flash Attention 3</td>
4218
+ <td>HuggingFace kernels Flash Attention 3</td>
4219
+ <td><a href="https://github.com/huggingface/kernels-community/tree/main/flash-attn3">GitHub</a></td>
4220
+ <td><a href="https://huggingface.co/kernels-community/flash-attn3">HF</a></td>
4221
+ <td><a href="flash_attn/impls/hf_kernels_flash_attn3.html">Bench</a></td>
4222
+ </tr>
4223
+ <tr>
4224
+ <td>Memory Efficient Attention</td>
4225
+ <td>Memory efficient attention implementation</td>
4226
+ <td>-</td>
4227
+ <td>-</td>
4228
+ <td><a href="flash_attn/impls/mem_efficient_attention.html">Bench</a></td>
4229
+ </tr>
4230
+ <tr>
4231
+ <td>Sage Attention</td>
4232
+ <td>Sage attention implementation</td>
4233
+ <td>-</td>
4234
+ <td><a href="https://huggingface.co/kernels-community/sage_attention">HF</a></td>
4235
+ <td><a href="flash_attn/impls/sage_attention.html">Bench</a></td>
4236
+ </tr>
4237
+ <tr>
4238
+ <td>xFormers</td>
4239
+ <td>xFormers attention implementation</td>
4240
+ <td><a href="https://github.com/facebookresearch/xformers">GitHub</a></td>
4241
+ <td>-</td>
4242
+ <td><a href="flash_attn/impls/xformers.html">Bench</a></td>
4243
  </tr>
4244
  </tbody>
4245
  </table>
4246
  <p align="center">
4247
  <button
4248
+ onclick="window.location.href='flash_attn/'"
4249
  class="btn">
4250
  Explore Full Bench
4251
  </button>
4252
  </p>
4253
 
4254
  <hr />
4255
+ <h2>DEFORMABLE DETR</h2>
4256
  <div class="artifact-preview">
4257
+ <img src="deformable_detr/results/artifacts/combine/latency.svg" alt="Deformable DETR Latency" width="800">
4258
  </div>
4259
 
4260
  <table>
 
4262
  <tr>
4263
  <th>Implementation</th>
4264
  <th>Description</th>
4265
+ <th>Source</th>
4266
+ <th>HF</th>
4267
+ <th>Bench</th>
4268
  </tr>
4269
  </thead>
4270
  <tbody>
4271
  <tr>
4272
+ <td>HF Kernels Deformable DETR</td>
4273
+ <td>HuggingFace kernels Deformable DETR implementation</td>
4274
+ <td><a href="https://github.com/huggingface/kernels-community/tree/main/deformable-detr">GitHub</a></td>
4275
+ <td><a href="https://huggingface.co/kernels-community/deformable-detr">HF</a></td>
4276
+ <td><a href="deformable_detr/impls/hf_kernels_deformable_detr.html">Bench</a></td>
 
4277
  </tr>
4278
  <tr>
4279
+ <td>PyTorch Deformable DETR</td>
4280
+ <td>PyTorch native Deformable DETR implementation</td>
4281
+ <td>-</td>
4282
+ <td>-</td>
4283
+ <td><a href="deformable_detr/impls/torch_deformable_detr.html">Bench</a></td>
4284
  </tr>
4285
+ </tbody>
4286
+ </table>
4287
+ <p align="center">
4288
+ <button
4289
+ onclick="window.location.href='deformable_detr/'"
4290
+ class="btn">
4291
+ Explore Full Bench
4292
+ </button>
4293
+ </p>
4294
+
4295
+ <hr />
4296
+ <h2>OPENAI-STYLE MOE</h2>
4297
+ <div class="artifact-preview">
4298
+ <img src="openai_moe/results/artifacts/combine/latency.svg" alt="OpenAI MoE Latency" width="800">
4299
+ </div>
4300
+
4301
+ <table>
4302
+ <thead>
4303
  <tr>
4304
+ <th>Implementation</th>
4305
+ <th>Description</th>
4306
+ <th>Source</th>
4307
+ <th>HF</th>
4308
+ <th>Bench</th>
4309
  </tr>
4310
+ </thead>
4311
+ <tbody>
4312
  <tr>
4313
+ <td>GptOssExperts</td>
4314
+ <td>Reference GPT-OSS OpenAI-style MoE implementation</td>
4315
+ <td>-</td>
4316
+ <td>-</td>
4317
+ <td><a href="openai_moe/impls/gpt_oss_moe.html">Bench</a></td>
4318
  </tr>
4319
  <tr>
4320
+ <td>Binned PyTorch</td>
4321
+ <td>Binned PyTorch OpenAI-style MoE implementation</td>
4322
+ <td>-</td>
4323
+ <td>-</td>
4324
+ <td><a href="openai_moe/impls/binned_torch.html">Bench</a></td>
4325
  </tr>
4326
  </tbody>
4327
  </table>
4328
  <p align="center">
4329
  <button
4330
+ onclick="window.location.href='openai_moe/'"
4331
  class="btn">
4332
  Explore Full Bench
4333
  </button>
 
4344
  <tr>
4345
  <th>Implementation</th>
4346
  <th>Description</th>
4347
+ <th>Source</th>
4348
+ <th>HF</th>
4349
+ <th>Bench</th>
4350
  </tr>
4351
  </thead>
4352
  <tbody>
4353
  <tr>
4354
  <td>HF Kernels Causal Conv1D</td>
4355
  <td>HuggingFace kernels implementation</td>
4356
+ <td><a href="https://github.com/huggingface/kernels-community/tree/main/causal-conv1d">GitHub</a></td>
4357
+ <td><a href="https://huggingface.co/kernels-community/causal-conv1d">HF</a></td>
4358
+ <td><a href="causal_conv1d/impls/hf_kernels_causal_conv1d.html">Bench</a></td>
4359
  </tr>
4360
  <tr>
4361
  <td>PyTorch Causal Conv1D</td>
4362
  <td>PyTorch native implementation</td>
4363
+ <td>-</td>
4364
+ <td>-</td>
4365
+ <td><a href="causal_conv1d/impls/torch_causal_conv1d.html">Bench</a></td>
4366
  </tr>
4367
  </tbody>
4368
  </table>
 
4375
  </p>
4376
 
4377
  <hr />
4378
+ <h2>ROTARY POSITION EMBEDDINGS</h2>
4379
  <div class="artifact-preview">
4380
+ <img src="rotary/results/artifacts/combine/latency.svg" alt="Rotary Position Embeddings Latency" width="800">
4381
  </div>
4382
 
4383
  <table>
 
4385
  <tr>
4386
  <th>Implementation</th>
4387
  <th>Description</th>
4388
+ <th>Source</th>
4389
+ <th>HF</th>
4390
+ <th>Bench</th>
4391
  </tr>
4392
  </thead>
4393
  <tbody>
4394
  <tr>
4395
+ <td>HF Kernels Rotary</td>
4396
+ <td>HuggingFace kernels implementation</td>
4397
+ <td><a href="https://github.com/huggingface/kernels-community/tree/main/rotary">GitHub</a></td>
4398
+ <td><a href="https://huggingface.co/kernels-community/rotary">HF</a></td>
4399
+ <td><a href="rotary/impls/hf_kernels_rotary.html">Bench</a></td>
4400
  </tr>
4401
  <tr>
4402
+ <td>PyTorch Rotary</td>
4403
+ <td>PyTorch native implementation</td>
4404
+ <td>-</td>
4405
+ <td>-</td>
4406
+ <td><a href="rotary/impls/torch_rotary.html">Bench</a></td>
4407
  </tr>
4408
  </tbody>
4409
  </table>
4410
  <p align="center">
4411
  <button
4412
+ onclick="window.location.href='rotary/'"
4413
  class="btn">
4414
  Explore Full Bench
4415
  </button>
4416
  </p>
4417
 
4418
  <hr />
4419
+ <h2>LAYER NORMALIZATION</h2>
4420
+ <div class="artifact-preview">
4421
+ <img src="layer_norm/results/artifacts/combine/latency.svg" alt="Layer Norm Latency" width="800">
4422
+ </div>
4423
+
4424
+ <table>
4425
+ <thead>
4426
+ <tr>
4427
+ <th>Implementation</th>
4428
+ <th>Description</th>
4429
+ <th>Source</th>
4430
+ <th>HF</th>
4431
+ <th>Bench</th>
4432
+ </tr>
4433
+ </thead>
4434
+ <tbody>
4435
+ <tr>
4436
+ <td>HF Kernels Layer Norm</td>
4437
+ <td>HuggingFace kernels implementation</td>
4438
+ <td><a href="https://github.com/huggingface/kernels-community/tree/main/layer-norm">GitHub</a></td>
4439
+ <td><a href="https://huggingface.co/kernels-community/layer-norm">HF</a></td>
4440
+ <td><a href="layer_norm/impls/hf_kernels_layer_norm.html">Bench</a></td>
4441
+ </tr>
4442
+ <tr>
4443
+ <td>PyTorch Layer Norm</td>
4444
+ <td>PyTorch native implementation</td>
4445
+ <td>-</td>
4446
+ <td>-</td>
4447
+ <td><a href="layer_norm/impls/torch_layer_norm.html">Bench</a></td>
4448
+ </tr>
4449
+ </tbody>
4450
+ </table>
4451
+ <p align="center">
4452
+ <button
4453
+ onclick="window.location.href='layer_norm/'"
4454
+ class="btn">
4455
+ Explore Full Bench
4456
+ </button>
4457
+ </p>
4458
+
4459
  <style>
4460
  .controls {
4461
  display: none !important;
 
4499
  }
4500
  :root {
4501
  --bg-alert: #0069cbff;
 
4502
  }
4503
  .alert {
4504
+ padding: 5px 10px;
4505
  background-color: var(--bg-alert);
 
4506
  margin-bottom: 10px;
4507
  border-radius: 6px;
4508
  }
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl CHANGED
@@ -1,4 +1,4 @@
1
- {"ts": "2025-10-30T15:53:20Z", "run": "43d96fd4ebe14ca496dc9089d5327f41", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8346939999910319, "p50": 0.8380950000059784, "p90": 0.838174000023173, "mean": 0.8376522000048681, "iqr": 0.0016900000332498166, "raw_times": [0.8346939999910319, 0.8364839999899232, 0.8408140000142339, 0.838174000023173, 0.8380950000059784], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8434949999696073, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
2
- {"ts": "2025-10-30T15:53:20Z", "run": "43d96fd4ebe14ca496dc9089d5327f41", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6432289999670502, "p50": 1.649038999971708, "p90": 1.6514490000076876, "mean": 1.6484529999956976, "iqr": 0.006049999967672193, "raw_times": [1.649038999971708, 1.6531489999920268, 1.6453990000400154, 1.6432289999670502, 1.6514490000076876], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.660748999995576, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
3
- {"ts": "2025-10-30T15:53:20Z", "run": "43d96fd4ebe14ca496dc9089d5327f41", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.639337999961299, "p50": 1.64666899996746, "p90": 1.647079000008489, "mean": 1.645640799983994, "iqr": 0.0004199999921183917, "raw_times": [1.64666899996746, 1.6484589999663513, 1.639337999961299, 1.647079000008489, 1.6466590000163706], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6403390000050422, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
4
- {"ts": "2025-10-30T15:53:21Z", "run": "43d96fd4ebe14ca496dc9089d5327f41", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.2456669999874066, "p50": 3.2605380000063633, "p90": 3.2625569999709114, "mean": 3.2589550000011513, "iqr": 0.014490999944882788, "raw_times": [3.2456669999874066, 3.2625569999709114, 3.2605380000063633, 3.277947000015047, 3.2480660000260286], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.260236999949484, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
 
1
+ {"ts": "2025-10-31T20:00:11Z", "run": "f2cf664f6646484f88815be637f5bc9d", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.807951000012963, "p50": 0.8174310000299556, "p90": 0.8198709999760467, "mean": 0.8162470000002031, "iqr": 0.0038399999766625115, "raw_times": [0.8160309999993842, 0.8198709999760467, 0.8174310000299556, 0.807951000012963, 0.819950999982666], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8318710000025931, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
2
+ {"ts": "2025-10-31T20:00:11Z", "run": "f2cf664f6646484f88815be637f5bc9d", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6729929999996784, "p50": 1.6790130000003956, "p90": 1.685203000022284, "mean": 1.6802827999867986, "iqr": 0.007120000077520672, "raw_times": [1.685203000022284, 1.6790130000003956, 1.6729929999996784, 1.686121999966872, 1.6780829999447633], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6821429999822612, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
3
+ {"ts": "2025-10-31T20:00:12Z", "run": "f2cf664f6646484f88815be637f5bc9d", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6009309999844845, "p50": 1.6056009999942944, "p90": 1.611341000000266, "mean": 1.606853000009778, "iqr": 0.008409999963987502, "raw_times": [1.6009309999844845, 1.6056009999942944, 1.613461000033567, 1.6029310000362784, 1.611341000000266], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6386120000220217, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
4
+ {"ts": "2025-10-31T20:00:12Z", "run": "f2cf664f6646484f88815be637f5bc9d", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.3123249999675863, "p50": 3.327974000001177, "p90": 3.3289149999973233, "mean": 3.3240905999946335, "iqr": 0.010180999993281148, "raw_times": [3.3325050000030387, 3.3289149999973233, 3.3123249999675863, 3.318734000004042, 3.327974000001177], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.335275000040383, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
layer_norm/impls/cells/benchmark.py CHANGED
@@ -3,7 +3,6 @@
3
  # dependencies = [
4
  # "numpy",
5
  # "torch==2.8.0",
6
- # "kernels",
7
  # "kernels-benchmark-tools",
8
  # ]
9
  #
@@ -13,37 +12,15 @@
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
- from kernels import get_kernel
17
 
18
- # Load the layer norm kernel
19
- layer_norm_kernel = get_kernel("kernels-community/layer-norm")
20
 
21
-
22
- def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
23
- B, S, D = x.shape
24
- # The kernel expects [N, D] input; support beta (bias) if provided.
25
- out = layer_norm_kernel.dropout_add_ln_fwd(
26
- input=x.view(-1, D),
27
- gamma=weight,
28
- beta=bias,
29
- rowscale=None,
30
- colscale=None,
31
- x0_subset=None,
32
- z_subset=None,
33
- dropout_p=0.0,
34
- epsilon=eps,
35
- rowscale_const=1.0,
36
- z_numrows=S,
37
- gen=None,
38
- residual_in_fp32=False,
39
- is_rms_norm=False,
40
- )[0].view(B, S, D)
41
- return out
42
 
43
 
44
  run_benchmark(
45
  kernel_type=KernelTypeEnum.LAYER_NORM,
46
- impl_name="hf_kernels_layer_norm",
47
- impl_tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"},
48
- impl_func=hf_kernels_layer_norm,
49
  )
 
3
  # dependencies = [
4
  # "numpy",
5
  # "torch==2.8.0",
 
6
  # "kernels-benchmark-tools",
7
  # ]
8
  #
 
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 
15
 
 
 
16
 
17
+ def torch_layer_norm(x, weight, bias, eps: float = 1e-5):
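+ # reference baseline: normalize over the last (hidden) dimension with the native op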
18
+ return torch.nn.functional.layer_norm(x, (x.shape[-1],), weight, bias, eps)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
 
21
  run_benchmark(
22
  kernel_type=KernelTypeEnum.LAYER_NORM,
23
+ impl_name="torch_layer_norm",
24
+ impl_tags={"family": "torch", "op": "layer_norm"},
25
+ impl_func=torch_layer_norm,
26
  )
layer_norm/impls/hf_kernels_layer_norm.html CHANGED
@@ -4107,11 +4107,12 @@ body[data-tool="eraser"] .main-content {
4107
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4108
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
4109
  </span> |
4110
- Cell: benchmark | 6.10s
4111
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4112
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4113
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4114
- <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/layer_norm/impls/hf_kernels_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
 
4115
  </div>
4116
  <div id="code-benchmark" class="cell-code" data-lines="49">
4117
  <div class="code-wrap">
@@ -4178,19 +4179,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
4178
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4179
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4180
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4181
- hf_kernels_layer_norm 4.90% 197.042us 46.64% 1.877ms 1.877ms 0.000us 0.00% 3.132ms 3.132ms 1
4182
- _layer_norm_f8ec252::dropout_add_ln_fwd 1.74% 69.952us 41.15% 1.656ms 551.934us 2.385ms 100.00% 3.132ms 1.044ms 3
4183
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.386ms 100.06% 2.386ms 2.386ms 1
4184
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.385ms 100.00% 2.385ms 794.945us 3
4185
- Activity Buffer Request 37.01% 1.489ms 37.01% 1.489ms 1.489ms 747.170us 31.33% 747.170us 747.170us 1
4186
- aten::view 0.59% 23.780us 0.59% 23.780us 3.963us 0.000us 0.00% 0.000us 0.000us 6
4187
- aten::empty 1.17% 47.212us 1.17% 47.212us 5.246us 0.000us 0.00% 0.000us 0.000us 9
4188
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.23% 9.090us 0.23% 9.090us 3.030us 0.000us 0.00% 0.000us 0.000us 3
4189
- cudaLaunchKernel 1.00% 40.411us 1.00% 40.411us 13.470us 0.000us 0.00% 0.000us 0.000us 3
4190
- cudaDeviceSynchronize 53.36% 2.147ms 53.36% 2.147ms 2.147ms 0.000us 0.00% 0.000us 0.000us 1
4191
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4192
- Self CPU time total: 4.023ms
4193
- Self CUDA time total: 2.385ms
4194
 
4195
 
4196
 
@@ -4200,19 +4201,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
4200
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4201
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4202
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4203
- hf_kernels_layer_norm 2.41% 154.482us 27.38% 1.753ms 1.753ms 0.000us 0.00% 6.413ms 6.413ms 1
4204
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.72% 46.409us 24.77% 1.586ms 528.643us 4.824ms 100.00% 6.413ms 2.138ms 3
4205
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.826ms 100.03% 4.826ms 4.826ms 1
4206
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.824ms 100.00% 4.824ms 1.608ms 3
4207
- Activity Buffer Request 23.06% 1.476ms 23.06% 1.476ms 1.476ms 1.588ms 32.92% 1.588ms 1.588ms 1
4208
- aten::view 0.20% 12.531us 0.20% 12.531us 2.089us 0.000us 0.00% 0.000us 0.000us 6
4209
- aten::empty 0.47% 30.283us 0.47% 30.283us 3.365us 0.000us 0.00% 0.000us 0.000us 9
4210
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.150us 0.08% 5.150us 1.717us 0.000us 0.00% 0.000us 0.000us 3
4211
- cudaLaunchKernel 0.43% 27.650us 0.43% 27.650us 9.217us 0.000us 0.00% 0.000us 0.000us 3
4212
- cudaDeviceSynchronize 72.62% 4.650ms 72.62% 4.650ms 4.650ms 0.000us 0.00% 0.000us 0.000us 1
4213
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4214
- Self CPU time total: 6.403ms
4215
- Self CUDA time total: 4.824ms
4216
 
4217
 
4218
 
@@ -4222,19 +4223,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
4222
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4223
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4224
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4225
- hf_kernels_layer_norm 2.19% 139.552us 27.69% 1.763ms 1.763ms 0.000us 0.00% 6.329ms 6.329ms 1
4226
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.72% 45.651us 25.31% 1.612ms 537.326us 4.772ms 100.00% 6.329ms 2.110ms 3
4227
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.774ms 100.03% 4.774ms 4.774ms 1
4228
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.772ms 100.00% 4.772ms 1.591ms 3
4229
- Activity Buffer Request 23.61% 1.504ms 23.61% 1.504ms 1.504ms 1.557ms 32.63% 1.557ms 1.557ms 1
4230
- aten::view 0.19% 11.951us 0.19% 11.951us 1.992us 0.000us 0.00% 0.000us 0.000us 6
4231
- aten::empty 0.48% 30.520us 0.48% 30.520us 3.391us 0.000us 0.00% 0.000us 0.000us 9
4232
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.131us 0.08% 5.131us 1.710us 0.000us 0.00% 0.000us 0.000us 3
4233
- cudaLaunchKernel 0.42% 26.970us 0.42% 26.970us 8.990us 0.000us 0.00% 0.000us 0.000us 3
4234
- cudaDeviceSynchronize 72.31% 4.606ms 72.31% 4.606ms 4.606ms 0.000us 0.00% 0.000us 0.000us 1
4235
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4236
- Self CPU time total: 6.370ms
4237
- Self CUDA time total: 4.772ms
4238
 
4239
 
4240
 
@@ -4244,36 +4245,38 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
4244
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4245
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4246
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4247
- hf_kernels_layer_norm 1.25% 143.461us 17.42% 1.995ms 1.995ms 0.000us 0.00% 12.814ms 12.814ms 1
4248
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.40% 45.652us 16.06% 1.839ms 613.131us 9.628ms 100.00% 12.814ms 4.271ms 3
4249
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.629ms 100.01% 9.629ms 9.629ms 1
4250
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.628ms 100.00% 9.628ms 3.209ms 3
4251
- Activity Buffer Request 12.97% 1.486ms 12.97% 1.486ms 1.486ms 3.186ms 33.09% 3.186ms 3.186ms 1
4252
- aten::view 0.11% 12.411us 0.11% 12.411us 2.069us 0.000us 0.00% 0.000us 0.000us 6
4253
- aten::empty 0.27% 31.101us 0.27% 31.101us 3.456us 0.000us 0.00% 0.000us 0.000us 9
4254
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.04% 5.010us 0.04% 5.010us 1.670us 0.000us 0.00% 0.000us 0.000us 3
4255
- cudaLaunchKernel 2.37% 271.915us 2.37% 271.915us 90.638us 0.000us 0.00% 0.000us 0.000us 3
4256
- cudaDeviceSynchronize 82.58% 9.458ms 82.58% 9.458ms 9.458ms 0.000us 0.00% 0.000us 0.000us 1
4257
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4258
- Self CPU time total: 11.453ms
4259
- Self CUDA time total: 9.628ms
4260
 
4261
 
4262
  impl wl p50(ms) ok
4263
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
4264
- hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4265
- hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4266
- hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
4267
  </pre></div>
4268
  <div class="uv-install-logs" id="uv-logs-benchmark">
4269
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4270
  <div class="uv-logs-content" style="display: none;">
4271
- Installed 15 packages in 12ms
 
 
4272
  </div>
4273
  </div>
4274
  <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4275
- Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.24it/s]
4276
- Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.49it/s]</div>
4277
  <div class="cell-artifacts">
4278
  <h4>Artifacts:</h4>
4279
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
 
4107
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4108
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
4109
  </span> |
4110
+ Cell: benchmark | 10.09s
4111
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4112
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4113
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4114
+ <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/layer_norm/impls/hf_kernels_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
4115
+ <a href="https://huggingface.co/kernels-community/layer-norm" target="_blank" class="hf-btn">🤗 HF</a>
4116
  </div>
4117
  <div id="code-benchmark" class="cell-code" data-lines="49">
4118
  <div class="code-wrap">
 
4179
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4180
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4181
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4182
+ hf_kernels_layer_norm 5.01% 203.177us 46.78% 1.895ms 1.895ms 0.000us 0.00% 3.141ms 3.141ms 1
4183
+ _layer_norm_f8ec252::dropout_add_ln_fwd 1.71% 69.312us 41.16% 1.668ms 555.914us 2.399ms 100.00% 3.141ms 1.047ms 3
4184
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.401ms 100.06% 2.401ms 2.401ms 1
4185
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.399ms 100.00% 2.399ms 799.825us 3
4186
+ Activity Buffer Request 36.95% 1.497ms 36.95% 1.497ms 1.497ms 742.012us 30.92% 742.012us 742.012us 1
4187
+ aten::view 0.61% 24.559us 0.61% 24.559us 4.093us 0.000us 0.00% 0.000us 0.000us 6
4188
+ aten::empty 1.20% 48.622us 1.20% 48.622us 5.402us 0.000us 0.00% 0.000us 0.000us 9
4189
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.23% 9.170us 0.23% 9.170us 3.057us 0.000us 0.00% 0.000us 0.000us 3
4190
+ cudaLaunchKernel 1.07% 43.390us 1.07% 43.390us 14.463us 0.000us 0.00% 0.000us 0.000us 3
4191
+ cudaDeviceSynchronize 53.22% 2.156ms 53.22% 2.156ms 2.156ms 0.000us 0.00% 0.000us 0.000us 1
4192
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4193
+ Self CPU time total: 4.052ms
4194
+ Self CUDA time total: 2.399ms
4195
 
4196
 
4197
 
 
4201
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4202
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4203
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4204
+ hf_kernels_layer_norm 1.88% 119.443us 26.75% 1.701ms 1.701ms 0.000us 0.00% 6.407ms 6.407ms 1
4205
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.71% 45.121us 24.67% 1.568ms 522.677us 4.827ms 100.00% 6.407ms 2.136ms 3
4206
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.829ms 100.03% 4.829ms 4.829ms 1
4207
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.827ms 100.00% 4.827ms 1.609ms 3
4208
+ Activity Buffer Request 22.91% 1.456ms 22.91% 1.456ms 1.456ms 1.580ms 32.72% 1.580ms 1.580ms 1
4209
+ aten::view 0.21% 13.200us 0.21% 13.200us 2.200us 0.000us 0.00% 0.000us 0.000us 6
4210
+ aten::empty 0.51% 32.711us 0.51% 32.711us 3.635us 0.000us 0.00% 0.000us 0.000us 9
4211
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.289us 0.08% 5.289us 1.763us 0.000us 0.00% 0.000us 0.000us 3
4212
+ cudaLaunchKernel 0.45% 28.522us 0.45% 28.522us 9.507us 0.000us 0.00% 0.000us 0.000us 3
4213
+ cudaDeviceSynchronize 73.25% 4.656ms 73.25% 4.656ms 4.656ms 0.000us 0.00% 0.000us 0.000us 1
4214
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4215
+ Self CPU time total: 6.357ms
4216
+ Self CUDA time total: 4.827ms
4217
 
4218
 
4219
 
 
4223
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4224
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4225
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4226
+ hf_kernels_layer_norm 1.89% 118.801us 26.85% 1.686ms 1.686ms 0.000us 0.00% 6.309ms 6.309ms 1
4227
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.78% 49.183us 24.77% 1.555ms 518.493us 4.763ms 100.00% 6.309ms 2.103ms 3
4228
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.765ms 100.03% 4.765ms 4.765ms 1
4229
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.763ms 100.00% 4.763ms 1.588ms 3
4230
+ Activity Buffer Request 22.96% 1.442ms 22.96% 1.442ms 1.442ms 1.546ms 32.46% 1.546ms 1.546ms 1
4231
+ aten::view 0.19% 11.741us 0.19% 11.741us 1.957us 0.000us 0.00% 0.000us 0.000us 6
4232
+ aten::empty 0.49% 30.460us 0.49% 30.460us 3.384us 0.000us 0.00% 0.000us 0.000us 9
4233
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.920us 0.08% 4.920us 1.640us 0.000us 0.00% 0.000us 0.000us 3
4234
+ cudaLaunchKernel 0.46% 29.050us 0.46% 29.050us 9.683us 0.000us 0.00% 0.000us 0.000us 3
4235
+ cudaDeviceSynchronize 73.15% 4.593ms 73.15% 4.593ms 4.593ms 0.000us 0.00% 0.000us 0.000us 1
4236
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4237
+ Self CPU time total: 6.279ms
4238
+ Self CUDA time total: 4.763ms
4239
 
4240
 
4241
 
 
4245
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4246
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4247
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4248
+ hf_kernels_layer_norm 1.11% 112.814us 7.31% 743.908us 743.908us 0.000us 0.00% 12.737ms 12.737ms 1
4249
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.47% 47.722us 6.09% 619.105us 206.368us 9.594ms 100.00% 12.737ms 4.246ms 3
4250
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.595ms 100.02% 9.595ms 9.595ms 1
4251
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.594ms 100.00% 9.594ms 3.198ms 3
4252
+ Activity Buffer Request 2.50% 254.176us 2.50% 254.176us 254.176us 3.143ms 32.76% 3.143ms 3.143ms 1
4253
+ aten::view 0.12% 11.989us 0.12% 11.989us 1.998us 0.000us 0.00% 0.000us 0.000us 6
4254
+ aten::empty 0.30% 30.280us 0.30% 30.280us 3.364us 0.000us 0.00% 0.000us 0.000us 9
4255
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.000us 0.05% 5.000us 1.667us 0.000us 0.00% 0.000us 0.000us 3
4256
+ cudaLaunchKernel 2.77% 281.927us 2.77% 281.927us 93.976us 0.000us 0.00% 0.000us 0.000us 3
4257
+ cudaDeviceSynchronize 92.69% 9.430ms 92.69% 9.430ms 9.430ms 0.000us 0.00% 0.000us 0.000us 1
4258
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4259
+ Self CPU time total: 10.174ms
4260
+ Self CUDA time total: 9.594ms
4261
 
4262
 
4263
  impl wl p50(ms) ok
4264
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
4265
+ hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
4266
+ hf_kernels_layer_norm LN_B16_S4096_D4096 1.66 True
4267
+ hf_kernels_layer_norm LN_B16_S4096_D8192 3.27 True
4268
  </pre></div>
4269
  <div class="uv-install-logs" id="uv-logs-benchmark">
4270
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4271
  <div class="uv-logs-content" style="display: none;">
4272
+ Downloading hf-xet (3.2MiB)
4273
+ Downloading hf-xet
4274
+ Installed 52 packages in 218ms
4275
  </div>
4276
  </div>
4277
  <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4278
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.22it/s]
4279
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.45it/s]</div>
4280
  <div class="cell-artifacts">
4281
  <h4>Artifacts:</h4>
4282
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
layer_norm/impls/torch_layer_norm.html CHANGED
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
4106
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
4107
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
4108
  </span> |
4109
- Cell: nv | 0.26s
4110
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
4112
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
4113
- <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/layer_norm/impls/torch_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
4114
  </div>
4115
  <div id="code-nv" class="cell-code" data-lines="2">
4116
  <div class="code-wrap">
@@ -4122,7 +4122,7 @@ Cell: nv | 0.26s
4122
  </div>
4123
  </div>
4124
  <div id="output-nv" class="cell-output">
4125
- <div class="cell-stdout"><pre class="stdout-text">Thu Oct 30 15:52:47 2025
4126
  +-----------------------------------------------------------------------------------------+
4127
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
4128
  |-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4131,7 @@ Cell: nv | 0.26s
4131
  | | | MIG M. |
4132
  |=========================================+========================+======================|
4133
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
4134
- | N/A 33C P0 139W / 350W | 0MiB / 46068MiB | 100% Default |
4135
  | | | N/A |
4136
  +-----------------------------------------+------------------------+----------------------+
4137
 
@@ -4153,13 +4153,13 @@ Cell: nv | 0.26s
4153
  <span class="collapse-indicators">
4154
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
4155
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4156
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
4157
  </span> |
4158
- Cell: benchmark | 7.42s
4159
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4160
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4161
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4162
- <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/layer_norm/impls/torch_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
4163
  </div>
4164
  <div id="code-benchmark" class="cell-code" data-lines="26">
4165
  <div class="code-wrap">
@@ -4203,19 +4203,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
4203
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4204
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4205
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4206
- torch_layer_norm 3.91% 153.364us 46.27% 1.815ms 1.815ms 0.000us 0.00% 3.039ms 3.039ms 1
4207
- aten::layer_norm 0.42% 16.299us 42.36% 1.661ms 553.716us 0.000us 0.00% 3.039ms 1.013ms 3
4208
- aten::native_layer_norm 2.01% 79.002us 41.94% 1.645ms 548.283us 2.327ms 100.00% 3.039ms 1.013ms 3
4209
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.329ms 100.06% 2.329ms 2.329ms 1
4210
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.327ms 100.00% 2.327ms 775.829us 3
4211
- Activity Buffer Request 37.33% 1.464ms 37.33% 1.464ms 1.464ms 711.872us 30.59% 711.872us 711.872us 1
4212
- aten::empty 1.19% 46.781us 1.19% 46.781us 5.198us 0.000us 0.00% 0.000us 0.000us 9
4213
- cudaLaunchKernel 1.21% 47.400us 1.21% 47.400us 15.800us 0.000us 0.00% 0.000us 0.000us 3
4214
- aten::view 0.20% 7.811us 0.20% 7.811us 1.302us 0.000us 0.00% 0.000us 0.000us 6
4215
- cudaDeviceSynchronize 53.73% 2.107ms 53.73% 2.107ms 2.107ms 0.000us 0.00% 0.000us 0.000us 1
4216
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4217
- Self CPU time total: 3.922ms
4218
- Self CUDA time total: 2.327ms
4219
 
4220
 
4221
 
@@ -4225,19 +4225,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
4225
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4226
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4227
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4228
- torch_layer_norm 1.15% 73.661us 25.36% 1.626ms 1.626ms 0.000us 0.00% 6.533ms 6.533ms 1
4229
- aten::layer_norm 0.14% 8.791us 24.21% 1.552ms 517.499us 0.000us 0.00% 6.533ms 2.178ms 3
4230
- aten::native_layer_norm 0.79% 50.951us 24.07% 1.544ms 514.569us 4.920ms 100.00% 6.533ms 2.178ms 3
4231
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.922ms 100.03% 4.922ms 4.922ms 1
4232
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.920ms 100.00% 4.920ms 1.640ms 3
4233
- Activity Buffer Request 22.34% 1.433ms 22.34% 1.433ms 1.433ms 1.613ms 32.78% 1.613ms 1.613ms 1
4234
- aten::empty 0.45% 28.941us 0.45% 28.941us 3.216us 0.000us 0.00% 0.000us 0.000us 9
4235
- cudaLaunchKernel 0.43% 27.430us 0.43% 27.430us 9.143us 0.000us 0.00% 0.000us 0.000us 3
4236
- aten::view 0.06% 3.590us 0.06% 3.590us 0.598us 0.000us 0.00% 0.000us 0.000us 6
4237
- cudaDeviceSynchronize 74.64% 4.787ms 74.64% 4.787ms 4.787ms 0.000us 0.00% 0.000us 0.000us 1
4238
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4239
- Self CPU time total: 6.413ms
4240
- Self CUDA time total: 4.920ms
4241
 
4242
 
4243
 
@@ -4247,19 +4247,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
4247
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4248
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4249
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4250
- torch_layer_norm 1.10% 68.311us 26.09% 1.619ms 1.619ms 0.000us 0.00% 6.232ms 6.232ms 1
4251
- aten::layer_norm 0.13% 8.220us 24.99% 1.551ms 516.952us 0.000us 0.00% 6.232ms 2.077ms 3
4252
- aten::native_layer_norm 0.83% 51.401us 24.86% 1.543ms 514.212us 4.714ms 100.00% 6.232ms 2.077ms 3
4253
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.716ms 100.03% 4.716ms 4.716ms 1
4254
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.714ms 100.00% 4.714ms 1.571ms 3
4255
- Activity Buffer Request 23.07% 1.432ms 23.07% 1.432ms 1.432ms 1.518ms 32.20% 1.518ms 1.518ms 1
4256
- aten::empty 0.45% 27.641us 0.45% 27.641us 3.071us 0.000us 0.00% 0.000us 0.000us 9
4257
- cudaLaunchKernel 0.45% 27.961us 0.45% 27.961us 9.320us 0.000us 0.00% 0.000us 0.000us 3
4258
- aten::view 0.06% 3.720us 0.06% 3.720us 0.620us 0.000us 0.00% 0.000us 0.000us 6
4259
- cudaDeviceSynchronize 73.91% 4.587ms 73.91% 4.587ms 4.587ms 0.000us 0.00% 0.000us 0.000us 1
4260
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4261
- Self CPU time total: 6.206ms
4262
- Self CUDA time total: 4.714ms
4263
 
4264
 
4265
 
@@ -4269,33 +4269,27 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
4269
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4270
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4271
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4272
- torch_layer_norm 0.61% 68.882us 14.40% 1.628ms 1.628ms 0.000us 0.00% 13.066ms 13.066ms 1
4273
- aten::layer_norm 0.08% 8.939us 13.79% 1.559ms 519.662us 0.000us 0.00% 13.066ms 4.355ms 3
4274
- aten::native_layer_norm 0.44% 49.281us 13.71% 1.550ms 516.682us 9.830ms 100.00% 13.066ms 4.355ms 3
4275
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.831ms 100.01% 9.831ms 9.831ms 1
4276
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.830ms 100.00% 9.830ms 3.277ms 3
4277
- Activity Buffer Request 11.27% 1.275ms 11.27% 1.275ms 1.275ms 3.236ms 32.92% 3.236ms 3.236ms 1
4278
- aten::empty 0.25% 28.400us 0.25% 28.400us 3.156us 0.000us 0.00% 0.000us 0.000us 9
4279
- cudaLaunchKernel 1.71% 193.833us 1.71% 193.833us 64.611us 0.000us 0.00% 0.000us 0.000us 3
4280
- aten::view 0.03% 3.811us 0.03% 3.811us 0.635us 0.000us 0.00% 0.000us 0.000us 6
4281
- cudaDeviceSynchronize 85.60% 9.678ms 85.60% 9.678ms 9.678ms 0.000us 0.00% 0.000us 0.000us 1
4282
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4283
- Self CPU time total: 11.306ms
4284
- Self CUDA time total: 9.830ms
4285
 
4286
 
4287
  impl wl p50(ms) ok
4288
  torch_layer_norm LN_B16_S2048_D4096 0.82 True
4289
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
4290
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
4291
- torch_layer_norm LN_B16_S4096_D8192 3.32 True
4292
  </pre></div>
4293
- <div class="uv-install-logs" id="uv-logs-benchmark">
4294
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4295
- <div class="uv-logs-content" style="display: none;">
4296
- Installed 37 packages in 236ms
4297
- </div>
4298
- </div>
4299
  <div class="cell-artifacts">
4300
  <h4>Artifacts:</h4>
4301
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
 
4106
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
4107
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
4108
  </span> |
4109
+ Cell: nv | 0.23s
4110
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
4111
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
4112
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
4113
+ <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/layer_norm/impls/torch_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
4114
  </div>
4115
  <div id="code-nv" class="cell-code" data-lines="2">
4116
  <div class="code-wrap">
 
4122
  </div>
4123
  </div>
4124
  <div id="output-nv" class="cell-output">
4125
+ <div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:00:08 2025
4126
  +-----------------------------------------------------------------------------------------+
4127
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
4128
  |-----------------------------------------+------------------------+----------------------+
 
4131
  | | | MIG M. |
4132
  |=========================================+========================+======================|
4133
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
4134
+ | N/A 32C P0 85W / 350W | 0MiB / 46068MiB | 22% Default |
4135
  | | | N/A |
4136
  +-----------------------------------------+------------------------+----------------------+
4137
 
 
4153
  <span class="collapse-indicators">
4154
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
4155
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
4156
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
4157
  </span> |
4158
+ Cell: benchmark | 3.89s
4159
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
4160
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
4161
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
4162
+ <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/layer_norm/impls/torch_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
4163
  </div>
4164
  <div id="code-benchmark" class="cell-code" data-lines="26">
4165
  <div class="code-wrap">
 
4203
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4204
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4205
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4206
+ torch_layer_norm 3.88% 150.743us 46.08% 1.790ms 1.790ms 0.000us 0.00% 3.031ms 3.031ms 1
4207
+ aten::layer_norm 0.46% 17.882us 42.20% 1.639ms 546.344us 0.000us 0.00% 3.031ms 1.010ms 3
4208
+ aten::native_layer_norm 2.05% 79.451us 41.74% 1.621ms 540.384us 2.322ms 100.00% 3.031ms 1.010ms 3
4209
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.323ms 100.06% 2.323ms 2.323ms 1
4210
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.322ms 100.00% 2.322ms 773.873us 3
4211
+ Activity Buffer Request 37.13% 1.442ms 37.13% 1.442ms 1.442ms 709.660us 30.57% 709.660us 709.660us 1
4212
+ aten::empty 1.23% 47.623us 1.23% 47.623us 5.291us 0.000us 0.00% 0.000us 0.000us 9
4213
+ cudaLaunchKernel 1.17% 45.281us 1.17% 45.281us 15.094us 0.000us 0.00% 0.000us 0.000us 3
4214
+ aten::view 0.17% 6.710us 0.17% 6.710us 1.118us 0.000us 0.00% 0.000us 0.000us 6
4215
+ cudaDeviceSynchronize 53.92% 2.094ms 53.92% 2.094ms 2.094ms 0.000us 0.00% 0.000us 0.000us 1
4216
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4217
+ Self CPU time total: 3.884ms
+ Self CUDA time total: 2.322ms



  -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
                                                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
  -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+ torch_layer_norm                                                 1.99%     129.362us        27.22%       1.769ms       1.769ms       0.000us         0.00%       6.490ms       6.490ms             1
+ aten::layer_norm                                                 0.17%      10.831us        25.23%       1.640ms     546.698us       0.000us         0.00%       6.490ms       2.163ms             3
+ aten::native_layer_norm                                          0.91%      59.414us        25.06%       1.629ms     543.087us       4.900ms       100.00%       6.490ms       2.163ms             3
+ torch_layer_norm                                                 0.00%       0.000us         0.00%       0.000us       0.000us       4.901ms       100.03%       4.901ms       4.901ms             1
+ void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.900ms       100.00%       4.900ms       1.633ms             3
+ Activity Buffer Request                                         23.14%       1.504ms        23.14%       1.504ms       1.504ms       1.590ms        32.46%       1.590ms       1.590ms             1
+ aten::empty                                                      0.46%      29.779us         0.46%      29.779us       3.309us       0.000us         0.00%       0.000us       0.000us             9
+ cudaLaunchKernel                                                 0.49%      31.860us         0.49%      31.860us      10.620us       0.000us         0.00%       0.000us       0.000us             3
+ aten::view                                                       0.06%       3.750us         0.06%       3.750us       0.625us       0.000us         0.00%       0.000us       0.000us             6
+ cudaDeviceSynchronize                                           72.78%       4.732ms        72.78%       4.732ms       4.732ms       0.000us         0.00%       0.000us       0.000us             1
  -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+ Self CPU time total: 6.501ms
+ Self CUDA time total: 4.900ms



  -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
                                                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
  -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+ torch_layer_norm                                                 1.73%     108.072us        26.73%       1.674ms       1.674ms       0.000us         0.00%       6.258ms       6.258ms             1
+ aten::layer_norm                                                 0.14%       8.910us        25.01%       1.566ms     522.010us       0.000us         0.00%       6.258ms       2.086ms             3
+ aten::native_layer_norm                                          0.87%      54.314us        24.86%       1.557ms     519.040us       4.736ms       100.00%       6.258ms       2.086ms             3
+ torch_layer_norm                                                 0.00%       0.000us         0.00%       0.000us       0.000us       4.737ms       100.03%       4.737ms       4.737ms             1
+ void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.736ms       100.00%       4.736ms       1.579ms             3
+ Activity Buffer Request                                         23.05%       1.444ms        23.05%       1.444ms       1.444ms       1.522ms        32.13%       1.522ms       1.522ms             1
+ aten::empty                                                      0.46%      28.531us         0.46%      28.531us       3.170us       0.000us         0.00%       0.000us       0.000us             9
+ cudaLaunchKernel                                                 0.43%      26.620us         0.43%      26.620us       8.873us       0.000us         0.00%       0.000us       0.000us             3
+ aten::view                                                       0.06%       4.039us         0.06%       4.039us       0.673us       0.000us         0.00%       0.000us       0.000us             6
+ cudaDeviceSynchronize                                           73.27%       4.589ms        73.27%       4.589ms       4.589ms       0.000us         0.00%       0.000us       0.000us             1
  -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+ Self CPU time total: 6.263ms
+ Self CUDA time total: 4.736ms



  -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
                                                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
  -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+ torch_layer_norm                                                 0.85%     101.562us        19.08%       2.285ms       2.285ms       0.000us         0.00%      13.093ms      13.093ms             1
+ aten::layer_norm                                                 0.08%       9.511us        18.23%       2.184ms     727.942us       0.000us         0.00%      13.093ms       4.364ms             3
+ aten::native_layer_norm                                          0.48%      57.051us        18.15%       2.174ms     724.772us       9.846ms       100.00%      13.093ms       4.364ms             3
+ torch_layer_norm                                                 0.00%       0.000us         0.00%       0.000us       0.000us       9.848ms       100.01%       9.848ms       9.848ms             1
+ void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       9.846ms       100.00%       9.846ms       3.282ms             3
+ Activity Buffer Request                                         11.95%       1.431ms        11.95%       1.431ms       1.431ms       3.247ms        32.97%       3.247ms       3.247ms             1
+ aten::empty                                                      0.24%      29.142us         0.24%      29.142us       3.238us       0.000us         0.00%       0.000us       0.000us             9
+ cudaLaunchKernel                                                 5.45%     653.217us         5.45%     653.217us     217.739us       0.000us         0.00%       0.000us       0.000us             3
+ aten::view                                                       0.03%       3.890us         0.03%       3.890us       0.648us       0.000us         0.00%       0.000us       0.000us             6
+ cudaDeviceSynchronize                                           80.92%       9.693ms        80.92%       9.693ms       9.693ms       0.000us         0.00%       0.000us       0.000us             1
  -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+ Self CPU time total: 11.979ms
+ Self CUDA time total: 9.846ms


  impl              wl                  p50(ms)  ok
  torch_layer_norm  LN_B16_S2048_D4096     0.82  True
  torch_layer_norm  LN_B16_S2048_D8192     1.68  True
  torch_layer_norm  LN_B16_S4096_D4096     1.61  True
+ torch_layer_norm  LN_B16_S4096_D8192     3.33  True
  </pre></div>
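
[Note: the tables above are standard torch.profiler output. A minimal sketch of how such a trace is typically captured — the workload shape mirrors LN_B16_S4096_D8192, while the dtype and rep count are assumptions, not the harness's actual code:]

    import torch
    from torch.profiler import ProfilerActivity, profile, record_function

    x = torch.randn(16, 4096, 8192, device="cuda", dtype=torch.bfloat16)
    w = torch.ones(8192, device="cuda", dtype=torch.bfloat16)
    b = torch.zeros(8192, device="cuda", dtype=torch.bfloat16)

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        with record_function("torch_layer_norm"):
            for _ in range(3):  # three reps, matching "# of Calls" = 3 in the tables
                torch.nn.functional.layer_norm(x, (8192,), w, b)
        torch.cuda.synchronize()  # appears as cudaDeviceSynchronize in the tables
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))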

  <div class="cell-artifacts">
  <h4>Artifacts:</h4>
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
layer_norm/results/artifacts/combine/latency.svg CHANGED

Git LFS Details (old)

  • SHA256: fae823f30e52d7309b2e012b577544ab4911a33cc2d4ec0acdc57866ceb942fa
  • Pointer size: 130 Bytes
  • Size of remote file: 14.6 kB

Git LFS Details (new)

  • SHA256: be29ece5a8e85e2941ac21710ec16efd87996aaf0e9b42756a2189660af81a2c
  • Pointer size: 130 Bytes
  • Size of remote file: 14.6 kB
layer_norm/results/combined_results.html CHANGED
@@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content {
   <rdf:RDF>
   <ns2:Work>
   <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:date>2025-10-30T15:53:45.192018</dc:date>
+ <dc:date>2025-10-31T20:13:56.885734</dc:date>
   <dc:format>image/svg+xml</dc:format>
   <dc:creator>
   <ns2:Agent>
@@ -4191,70 +4191,70 @@ / @@ -4262,27 +4262,27 @@
  [Regenerated latency plot (inline SVG): the y-axis gridlines, tick marks, and tick labels
  (1.0-3.0) shift by fractions of a pixel, and the torch-layer-norm and hf-kernels-layer-norm
  series paths and point markers are redrawn at the newly measured coordinates; the SVG
  structure is unchanged.]
@@ -4428,13 +4428,13 @@ COMBINED BENCHMARK SUMMARY
   impl                   wl                  p50(ms)  ok
   hf_kernels_layer_norm  LN_B16_S2048_D4096     0.84  True
- hf_kernels_layer_norm  LN_B16_S2048_D8192     1.65  True
- hf_kernels_layer_norm  LN_B16_S4096_D4096     1.65  True
- hf_kernels_layer_norm  LN_B16_S4096_D8192     3.26  True
+ hf_kernels_layer_norm  LN_B16_S2048_D8192     1.66  True
+ hf_kernels_layer_norm  LN_B16_S4096_D4096     1.66  True
+ hf_kernels_layer_norm  LN_B16_S4096_D8192     3.27  True
   torch_layer_norm       LN_B16_S2048_D4096     0.82  True
   torch_layer_norm       LN_B16_S2048_D8192     1.68  True
   torch_layer_norm       LN_B16_S4096_D4096     1.61  True
- torch_layer_norm       LN_B16_S4096_D8192     3.32  True
+ torch_layer_norm       LN_B16_S4096_D8192     3.33  True

  GENERATING COMBINED VISUALIZATION
@@ -4454,7 +4454,7 @@
   <div class="uv-install-logs" id="uv-logs-combine">
   <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
   <div class="uv-logs-content" style="display: none;">
- Installed 37 packages in 222ms
+ Installed 37 packages in 216ms
   </div>
   </div>
   <div class="cell-artifacts">
@@ -4467,7 +4467,7 @@
   <ns2:Work>
   <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:date>2025-10-30T15:53:45.192018</dc:date>
+ <dc:date>2025-10-31T20:13:56.885734</dc:date>
   <dc:format>image/svg+xml</dc:format>
@@ -4551,70 +4551,70 @@ / @@ -4622,27 +4622,27 @@
  [Second embedded copy of the same latency SVG (artifact preview): identical tick and series
  coordinate updates as the inline copy above.]
openai_moe/impls/artifacts/benchmark/openai_moe.jsonl ADDED
@@ -0,0 +1,8 @@
+ {"ts": "2025-10-31T20:01:48Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_E2", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 149.85902099999748, "p50": 150.05062800003088, "p90": 150.2997029999733, "mean": 150.08009959999526, "iqr": 0.4259410000031494, "raw_times": [149.85902099999748, 150.3173840000045, 150.2997029999733, 149.87376199997016, 150.05062800003088], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 150.9511389999716, "peak_bytes": 416866816, "ok": true, "absmax": 2.765655517578125e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.765655517578125e-05, "mae": 2.0696452338597737e-06, "mse": 7.332408985538663e-12, "ref": "naive_moe"}, "err": null}
+ {"ts": "2025-10-31T20:02:12Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_E4", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 199.76808500001653, "p50": 200.257487999977, "p90": 201.3672960000008, "mean": 200.6008808000047, "iqr": 1.3947150000035435, "raw_times": [200.257487999977, 201.63895400003184, 201.3672960000008, 199.97258099999726, 199.76808500001653], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 200.2076969999962, "peak_bytes": 632035840, "ok": true, "absmax": 1.621246337890625e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.621246337890625e-05, "mae": 9.61917862696282e-07, "mse": 1.59423277530657e-12, "ref": "naive_moe"}, "err": null}
+ {"ts": "2025-10-31T20:02:55Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S1024_E2", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 370.4508769999961, "p50": 372.7904090000038, "p90": 374.84007900002325, "mean": 372.8004498000132, "iqr": 3.7740770000027624, "raw_times": [374.84007900002325, 371.0660020000205, 370.4508769999961, 374.85488200002237, 372.7904090000038], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 371.103493000021, "peak_bytes": 643844608, "ok": true, "absmax": 2.6226043701171875e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.6226043701171875e-05, "mae": 2.0501920516835526e-06, "mse": 7.1848811622476916e-12, "ref": "naive_moe"}, "err": null}
+ {"ts": "2025-10-31T20:03:43Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S1024_E4", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 381.2919249999709, "p50": 382.6824700000202, "p90": 382.6975609999863, "mean": 382.48455139998896, "iqr": 0.3518089999943186, "raw_times": [382.345751999992, 381.2919249999709, 383.4050489999754, 382.6975609999863, 382.6824700000202], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 384.12325699999883, "peak_bytes": 823386112, "ok": true, "absmax": 1.3589859008789062e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.3589859008789062e-05, "mae": 9.400179123986163e-07, "mse": 1.5130355735665235e-12, "ref": "naive_moe"}, "err": null}
+ {"ts": "2025-10-31T20:05:12Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S512_E2", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 735.1488859999336, "p50": 742.0204380000541, "p90": 746.9078719999516, "mean": 742.4016768000001, "iqr": 5.8942259998957525, "raw_times": [746.9175420000056, 746.9078719999516, 742.0204380000541, 735.1488859999336, 741.0136460000558], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 715.4345070000545, "peak_bytes": 1036112384, "ok": true, "absmax": 3.2901763916015625e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 3.2901763916015625e-05, "mae": 2.0572656467265915e-06, "mse": 7.247809123700488e-12, "ref": "naive_moe"}, "err": null}
+ {"ts": "2025-10-31T20:06:54Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S512_E4", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 799.7175619999553, "p50": 801.8970370000034, "p90": 803.0568570000014, "mean": 801.7179149999947, "iqr": 2.358569999955762, "raw_times": [799.7175619999553, 800.6982870000456, 803.2198319999679, 803.0568570000014, 801.8970370000034], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 797.9236759999822, "peak_bytes": 1235263488, "ok": true, "absmax": 1.430511474609375e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.430511474609375e-05, "mae": 9.400343401466671e-07, "mse": 1.5107844445957919e-12, "ref": "naive_moe"}, "err": null}
+ {"ts": "2025-10-31T20:09:51Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S1024_E2", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1484.2085319999114, "p50": 1486.4837999999736, "p90": 1487.529773999995, "mean": 1488.3352192000075, "iqr": 2.3281069999256943, "raw_times": [1498.252323000088, 1486.4837999999736, 1484.2085319999114, 1485.2016670000694, 1487.529773999995], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1502.5766269999394, "peak_bytes": 1861947904, "ok": true, "absmax": 2.6226043701171875e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.6226043701171875e-05, "mae": 2.060702854578267e-06, "mse": 7.262949790198814e-12, "ref": "naive_moe"}, "err": null}
+ {"ts": "2025-10-31T20:13:14Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S1024_E4", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1520.7084719999102, "p50": 1524.500331000013, "p90": 1525.4868470000247, "mean": 1524.7435091999705, "iqr": 1.6920530000561484, "raw_times": [1529.2271019999362, 1524.500331000013, 1523.7947939999685, 1525.4868470000247, 1520.7084719999102], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1532.9394789999924, "peak_bytes": 2062163968, "ok": true, "absmax": 1.5974044799804688e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.5974044799804688e-05, "mae": 9.529014732834185e-07, "mse": 1.5621694476192216e-12, "ref": "naive_moe"}, "err": null}
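
[Note: each record above is one line of JSON. A minimal sketch for summarizing the file, using only fields visible in the records ("impl", "wl.name", "lat_ms.p50", "ok"):]

    import json

    with open("openai_moe.jsonl") as f:
        for line in f:
            rec = json.loads(line)
            # e.g. "binned_torch  cuda_B4_S1024_E4  p50=1524.50 ms  ok=True"
            print(f'{rec["impl"]:>14}  {rec["wl"]["name"]:>18}  '
                  f'p50={rec["lat_ms"]["p50"]:8.2f} ms  ok={rec["ok"]}')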
openai_moe/impls/binned_torch.html ADDED
The diff for this file is too large to render. See raw diff
 
openai_moe/impls/cells/benchmark.py ADDED
@@ -0,0 +1,136 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #     "numpy",
+ #     "torch==2.8.0",
+ #     "kernels-benchmark-tools",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+ # ///
+ import torch
+ import sys
+ from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+ def binned_gather(x, indices, bins, expert_capacity, top_k):
+     E, H = bins.shape[0], x.shape[1]
+     out = torch.zeros((E, expert_capacity, H), device=x.device, dtype=x.dtype)
+     for e in range(E):
+         start = 0 if e == 0 else bins[e - 1]
+         end = bins[e]
+         n = min(end - start, expert_capacity)
+         for i in range(n):
+             flat_pos = indices[start + i]
+             tok = flat_pos // top_k
+             out[e, i] = x[tok]
+     return out
+
+
+ def binned_scatter(x, indices, weights, bins, expert_capacity, top_k):
+     E, C, H = x.shape
+     N = indices.shape[0] // top_k
+     out = torch.zeros((N, top_k, H), dtype=x.dtype, device=x.device)
+     for e in range(E):
+         start = 0 if e == 0 else bins[e - 1]
+         end = bins[e]
+         n = end - start
+         if n == 0:
+             continue
+         take = min(n, expert_capacity)
+         for i in range(take):
+             flat_pos = indices[start + i]  # flattened (token, slot)
+             tok = flat_pos // top_k
+             slot = flat_pos % top_k
+             scale = weights[flat_pos] if weights is not None else 1.0
+             out[tok, slot] = x[e, i] * scale
+     return out.sum(dim=1)
+
+
+ def sort_tokens_by_expert(router_indices, num_experts):
+     flat_indices = router_indices.flatten()
+     sorted_values, sorted_indices = torch.sort(flat_indices)
+     tokens_per_expert = torch.bincount(sorted_values, minlength=num_experts)
+     bins = torch.cumsum(tokens_per_expert, dim=0)
+     return sorted_indices, sorted_values, bins, tokens_per_expert
+
+
+ def binned_experts_ref(
+     hidden_states,
+     router_indices,
+     routing_weights,
+     gate_up_proj,
+     gate_up_proj_bias,
+     down_proj,
+     down_proj_bias,
+     expert_capacity,
+ ):
+     B, S, H = hidden_states.shape
+     E, K = routing_weights.shape[2], router_indices.shape[1]
+
+     indices, _, bins, _ = sort_tokens_by_expert(router_indices, E)
+     x = binned_gather(hidden_states.view(-1, H), indices, bins, expert_capacity, K)
+
+     gate_up = torch.bmm(x, gate_up_proj) + gate_up_proj_bias[..., None, :]
+     gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+
+     # clamp to limit
+     limit = 7.0
+     gate = gate.clamp(min=None, max=limit)
+     up = up.clamp(min=-limit, max=limit)
+
+     glu = gate * torch.sigmoid(gate * 1.702)
+     x = (up + 1) * glu
+     x = torch.bmm(x, down_proj) + down_proj_bias[..., None, :]
+
+     # build routing weights aligned to (token, slot)
+     flat_dense = routing_weights.view(-1, E)  # [B*S, E]
+     flat_router = router_indices.view(-1, K)  # [B*S, K]
+     selected = torch.gather(flat_dense, 1, flat_router).reshape(-1)  # [B*S*K]
+
+     # scatter back
+     y = binned_scatter(x, indices, selected, bins, expert_capacity, K)  # [B*S, H]
+
+     return y.view(B, S, H)
+
+
+ def binned_torch_openai_moe(
+     hidden_states,
+     router_indices,
+     routing_weights,
+     gate_up_proj,
+     gate_up_proj_bias,
+     down_proj,
+     down_proj_bias,
+ ):
+     """
+     Binned PyTorch implementation of OpenAI-style MoE.
+     Sorts tokens by expert assignment for more efficient batched processing.
+     """
+     B, S = hidden_states.shape[0], hidden_states.shape[1]
+     K = router_indices.shape[1]
+
+     # Set expert_capacity to a reasonable value (max tokens per expert)
+     # Use 2x the average to handle imbalance
+     expert_capacity = (B * S * K * 2) // routing_weights.shape[2]
+
+     return binned_experts_ref(
+         hidden_states,
+         router_indices,
+         routing_weights,
+         gate_up_proj,
+         gate_up_proj_bias,
+         down_proj,
+         down_proj_bias,
+         expert_capacity,
+     )
+
+
+ run_benchmark(
+     kernel_type=KernelTypeEnum.OPENAI_MOE,
+     impl_name="binned_torch",
+     impl_tags={"family": "pytorch", "backend": "eager"},
+     impl_func=binned_torch_openai_moe,
+     dtype="float32",
+ )
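
[Note: a minimal smoke test for binned_torch_openai_moe. The tensor shapes are inferred from the code above (router_indices as [B*S, K], dense routing_weights as [B, S, E]); the tiny sizes are illustrative, not the benchmark workloads:]

    import torch

    B, S, H, D, E, K = 2, 8, 16, 32, 4, 2
    hidden_states = torch.randn(B, S, H)
    logits = torch.randn(B * S, E)
    routing_weights = torch.softmax(logits, dim=-1).view(B, S, E)  # dense over experts
    router_indices = torch.topk(logits, K, dim=-1).indices         # [B*S, K]
    gate_up_proj = 0.02 * torch.randn(E, H, 2 * D)   # interleaved gate/up columns
    gate_up_proj_bias = torch.zeros(E, 2 * D)
    down_proj = 0.02 * torch.randn(E, D, H)
    down_proj_bias = torch.zeros(E, H)

    out = binned_torch_openai_moe(
        hidden_states, router_indices, routing_weights,
        gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias,
    )
    print(out.shape)  # torch.Size([2, 8, 16])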
openai_moe/impls/cells/nv.py ADDED
@@ -0,0 +1,2 @@
+ import subprocess
+ print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
openai_moe/impls/gpt_oss_moe.html ADDED
The diff for this file is too large to render. See raw diff
 
openai_moe/impls/index.html ADDED
@@ -0,0 +1,89 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <meta charset='UTF-8'>
+     <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+     <title>Index of /openai_moe/impls</title>
+     <style>
+         :root {
+             --bg-primary: #0a0a0a;
+             --bg-secondary: #121212;
+             --bg-tertiary: #181818;
+             --text-primary: #e0e0e0;
+             --text-secondary: #888888;
+             --text-link: #64b5f6;
+             --border-primary: #2a2a2a;
+         }
+         body {
+             font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+             background: var(--bg-primary);
+             color: var(--text-primary);
+             margin: 0;
+             padding: 16px;
+             max-width: 900px;
+             margin: 0 auto;
+         }
+         .controls {
+             display: flex;
+             justify-content: flex-end;
+             margin-bottom: 1rem;
+         }
+         .back-button {
+             background: var(--bg-secondary);
+             border: 1px solid var(--border-primary);
+             padding: 8px 12px;
+             border-radius: 4px;
+             color: var(--text-secondary);
+             cursor: pointer;
+             font-size: 0.9rem;
+             text-decoration: none;
+             display: inline-block;
+         }
+         .back-button:hover {
+             color: var(--text-primary);
+             background: var(--bg-tertiary);
+         }
+         h1 {
+             font-size: 1.5em;
+             margin: 1rem 0;
+             color: var(--text-primary);
+             border-bottom: 1px solid var(--border-primary);
+             padding-bottom: 0.5rem;
+         }
+         ul {
+             list-style-type: none;
+             padding: 0;
+         }
+         li {
+             margin: 0;
+             border-bottom: 1px solid var(--border-primary);
+         }
+         li:last-child {
+             border-bottom: none;
+         }
+         a {
+             display: block;
+             padding: 0.75rem 0.5rem;
+             text-decoration: none;
+             color: var(--text-link);
+             transition: background 0.2s ease;
+         }
+         a:hover {
+             background: var(--bg-secondary);
+         }
+         .dir {
+             font-weight: 500;
+         }
+     </style>
+ </head>
+ <body>
+     <div class='controls'>
+         <a href='../index.html' class='back-button'>← back</a>
+     </div>
+     <h1>Index of /openai_moe/impls</h1>
+     <ul>
+         <li><a href='binned_torch.html' class='file'>binned_torch.html</a></li>
+         <li><a href='gpt_oss_moe.html' class='file'>gpt_oss_moe.html</a></li>
+     </ul>
+ </body>
+ </html>
openai_moe/index.html ADDED
@@ -0,0 +1,89 @@
 [89 lines: identical dark-theme directory-index boilerplate to openai_moe/impls/index.html
  above, differing only in the title ("Index of /openai_moe") and the listed links:
  impls/index.html and results/index.html.]
openai_moe/results/artifacts/combine/latency.svg ADDED

Git LFS Details

  • SHA256: b6b68c91c95cfb46a71083a3812949c831a6e82a5f655eb32ed7c0b19426124d
  • Pointer size: 130 Bytes
  • Size of remote file: 21.9 kB
openai_moe/results/cells/combine.py ADDED
@@ -0,0 +1,27 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #     "numpy",
+ #     "torch==2.8.0",
+ #     "kernels-benchmark-tools",
+ #     "matplotlib",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+ # ///
+ from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+ # Map display names to uvnote environment variables
+ cache_env_map = {
+     # "PyTorch OpenAI MoE": "UVNOTE_FILE_TORCH_OPENAI_MOE_BENCHMARK",
+     "Binned PyTorch": "UVNOTE_FILE_BINNED_TORCH_BENCHMARK",
+     "GptOssExperts": "UVNOTE_FILE_GPT_OSS_MOE_BENCHMARK",
+ }
+
+ # Generate combined results with visualization
+ generate_combined_results(
+     cache_env_map=cache_env_map,
+     output_filename="openai_moe.jsonl",
+     svg_filename="latency.svg"
+ )
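
[Note: a hedged sketch of how cache_env_map is presumably consumed — each uvnote variable is assumed to point at a cached cell directory holding that implementation's benchmark JSONL. The loop below only inspects the environment; it is not the library's documented API:]

    import os

    for display_name, env_var in cache_env_map.items():
        cell_dir = os.environ.get(env_var)
        status = cell_dir if cell_dir else "<unset - implementation skipped>"
        print(f"{display_name}: {env_var} -> {status}")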
openai_moe/results/combined_results.html ADDED
The diff for this file is too large to render. See raw diff
 
openai_moe/results/index.html ADDED
@@ -0,0 +1,88 @@
 
 [88 lines: identical dark-theme directory-index boilerplate to openai_moe/impls/index.html
  above, differing only in the title ("Index of /openai_moe/results") and the single listed
  link: combined_results.html.]