Commit f2afc26 · verified · committed by drbh (HF Staff)
Parent(s): 05bebc1

Upload folder using huggingface_hub
activation/impls/artifacts/benchmark/activation.jsonl CHANGED
@@ -1,9 +1,9 @@
1
- {"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.04845100920647383, "p50": 0.04891102435067296, "p90": 0.05595100810751319, "mean": 0.051765027455985546, "iqr": 0.007269962225109339, "raw_times": [0.04868104588240385, 0.04891102435067296, 0.04845100920647383, 0.0568310497328639, 0.05595100810751319], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.055451004300266504, "peak_bytes": 2164736, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
2
- {"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.053381023462861776, "p50": 0.05378195783123374, "p90": 0.055961019825190306, "mean": 0.05980720743536949, "iqr": 0.002529995981603861, "raw_times": [0.055961019825190306, 0.053381023462861776, 0.08248101221397519, 0.05378195783123374, 0.053431023843586445], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.061191036365926266, "peak_bytes": 2885632, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
3
- {"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05066103767603636, "p50": 0.0530310207977891, "p90": 0.0544210197404027, "mean": 0.052935024723410606, "iqr": 0.002769986167550087, "raw_times": [0.05066103767603636, 0.0544210197404027, 0.0530310207977891, 0.05491101182997227, 0.05165103357285261], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057801022194325924, "peak_bytes": 5769216, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
4
- {"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.051781011279672384, "p50": 0.053541967645287514, "p90": 0.05360104842111468, "mean": 0.05314521258696914, "iqr": 0.0010800431482493877, "raw_times": [0.052521005272865295, 0.05428103031590581, 0.05360104842111468, 0.053541967645287514, 0.051781011279672384], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0558110186830163, "peak_bytes": 4327424, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
5
- {"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05193101242184639, "p50": 0.05308096297085285, "p90": 0.05407101707533002, "mean": 0.05422099493443966, "iqr": 0.0010799849405884743, "raw_times": [0.052991032134741545, 0.05407101707533002, 0.05903095006942749, 0.05308096297085285, 0.05193101242184639], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05453097401186824, "peak_bytes": 5769216, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
6
- {"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05170103395357728, "p50": 0.05204096669331193, "p90": 0.0529709504917264, "mean": 0.0523771857842803, "iqr": 0.0009989598765969276, "raw_times": [0.05204096669331193, 0.05170103395357728, 0.05320098716765642, 0.05197199061512947, 0.0529709504917264], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05481095286086202, "peak_bytes": 11536384, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
7
- {"ts": "2025-10-24T19:24:49Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05258101737126708, "p50": 0.05330098792910576, "p90": 0.053990981541574, "mean": 0.053516996558755636, "iqr": 0.0007699709385633469, "raw_times": [0.05258101737126708, 0.053221010603010654, 0.05330098792910576, 0.054490985348820686, 0.053990981541574], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05454104393720627, "peak_bytes": 8652800, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
8
- {"ts": "2025-10-24T19:24:49Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.050960981752723455, "p50": 0.05149102071300149, "p90": 0.05149102071300149, "mean": 0.051745015662163496, "iqr": 0.00012997770681977272, "raw_times": [0.050960981752723455, 0.05149102071300149, 0.05149102071300149, 0.05342101212590933, 0.05136104300618172], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05551200592890382, "peak_bytes": 11536384, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
9
- {"ts": "2025-10-24T19:24:49Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.049641006626188755, "p50": 0.05309097468852997, "p90": 0.05348102422431111, "mean": 0.052487198263406754, "iqr": 0.0008300412446260452, "raw_times": [0.049641006626188755, 0.05357200279831886, 0.05265098297968507, 0.05309097468852997, 0.05348102422431111], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05537096876651049, "peak_bytes": 23070720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
 
1
+ {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024320999955307343, "p50": 0.025090999997701147, "p90": 0.02569000002949906, "mean": 0.026606800020090304, "iqr": 0.0010690000635804608, "raw_times": [0.03331100015202537, 0.025090999997701147, 0.024320999955307343, 0.02569000002949906, 0.0246209999659186], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03336100007800269, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
2
+ {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028640999971685233, "p50": 0.02958999994007172, "p90": 0.030561000130546745, "mean": 0.02986059994327661, "iqr": 0.0012610003068402875, "raw_times": [0.029299999823706457, 0.028640999971685233, 0.02958999994007172, 0.0312109998503729, 0.030561000130546745], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03354099999341997, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
3
+ {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02880000010918593, "p50": 0.030331000061778468, "p90": 0.030401000003621448, "mean": 0.030208600037440192, "iqr": 0.0004209998678561533, "raw_times": [0.02880000010918593, 0.03153099987684982, 0.029980000135765295, 0.030331000061778468, 0.030401000003621448], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03317000005154114, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
4
+ {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02921000009337149, "p50": 0.0294310000299447, "p90": 0.029789999871354667, "mean": 0.029938399984530406, "iqr": 0.0004489997991186101, "raw_times": [0.0294310000299447, 0.02921000009337149, 0.03191999985574512, 0.029341000072236056, 0.029789999871354667], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03343000003042107, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
5
+ {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.029799999992974335, "p50": 0.031021000040709623, "p90": 0.031239999998433632, "mean": 0.03210639997632825, "iqr": 0.0009289999525208259, "raw_times": [0.038159999803610845, 0.031021000040709623, 0.029799999992974335, 0.030311000045912806, 0.031239999998433632], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03207100007784902, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
6
+ {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0278800000614865, "p50": 0.028550999786602915, "p90": 0.029250000125102815, "mean": 0.02903839999817137, "iqr": 0.0010100000054080738, "raw_times": [0.0278800000614865, 0.02824000011969474, 0.028550999786602915, 0.029250000125102815, 0.03127099989796989], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03262000018366962, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
7
+ {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02945000005638576, "p50": 0.029881000045861583, "p90": 0.03017099993485317, "mean": 0.03019639998456114, "iqr": 0.0005509998572961194, "raw_times": [0.029881000045861583, 0.03185999980814813, 0.03017099993485317, 0.02945000005638576, 0.02962000007755705], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031610000178261544, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
8
+ {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02807000009852345, "p50": 0.028989999918849207, "p90": 0.02929000015683414, "mean": 0.028920200020365883, "iqr": 0.0003590002961573191, "raw_times": [0.029320000066945795, 0.02929000015683414, 0.02807000009852345, 0.028989999918849207, 0.02893099986067682], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.033219999977518455, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
9
+ {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.029301000040504732, "p50": 0.03090099994551565, "p90": 0.03149000008306757, "mean": 0.03127060003862425, "iqr": 0.0014889999420120148, "raw_times": [0.029301000040504732, 0.030001000141055556, 0.03149000008306757, 0.03465999998297775, 0.03090099994551565], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03197000000909611, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
activation/impls/cells/benchmark.py CHANGED
@@ -4,6 +4,7 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
@@ -12,17 +13,22 @@
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
15
- import torch, torch.nn.functional as F
16
 
17
 
18
- def swiglu_eager(x):
19
- d = x.shape[-1] // 2
20
- return F.silu(x[..., :d]) * x[..., d:]
21
 
22
 
23
  run_benchmark(
24
  kernel_type=KernelTypeEnum.ACTIVATION,
25
- impl_name="torch_eager",
26
- impl_tags={"family":"hf-kernels", "backend":"eager"},
27
- impl_func=swiglu_eager,
28
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
+ # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
 
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
+ from kernels import get_kernel
17
 
18
+ # Load the activation kernel
19
+ activation = get_kernel("kernels-community/activation")
20
 
21
+
22
+ def hf_kernels_swiglu(input_tensor):
23
+ hidden_dim = input_tensor.shape[-1] // 2
24
+ out_shape = input_tensor.shape[:-1] + (hidden_dim,)
25
+ out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
26
+ return activation.silu_and_mul(out, input_tensor)
27
 
28
 
29
  run_benchmark(
30
  kernel_type=KernelTypeEnum.ACTIVATION,
31
+ impl_name="hf_kernels_swiglu",
32
+ impl_tags={"family": "hf-kernels", "backend": "cuda"},
33
+ impl_func=hf_kernels_swiglu,
34
  )
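
The change above replaces the eager SwiGLU reference with the kernels-community/activation CUDA kernel, whose silu_and_mul writes into a caller-allocated output tensor. A minimal cross-check sketch (assumes a CUDA device and the `kernels` package; tolerances mirror the rtol/atol recorded in the new JSONL records):

import torch
import torch.nn.functional as F
from kernels import get_kernel

def swiglu_eager(x):
    # SwiGLU: SiLU-gate the first half of the last dim, multiply by the second half.
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

activation = get_kernel("kernels-community/activation")

# One workload from the grid above: 128 tokens, hidden_dim 768 (input last dim is 2 * 768).
x = torch.randn(128, 2 * 768, dtype=torch.bfloat16, device="cuda")
out = torch.empty(x.shape[:-1] + (x.shape[-1] // 2,), dtype=x.dtype, device=x.device)
activation.silu_and_mul(out, x)

torch.testing.assert_close(out, swiglu_eager(x), rtol=2e-2, atol=2e-2)
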
activation/impls/hf_kernels_swiglu.html CHANGED
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
3861
  </div>
3862
  </div>
3863
 
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: nv | 4.02s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,34 +3887,22 @@ Cell: nv | 4.02s
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
- <div class="cell-stdout"><pre class="stdout-text">Fri Oct 24 19:18:43 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
- | NVIDIA-SMI 560.35.05 Driver Version: 560.35.05 CUDA Version: 12.6 |
3893
  |-----------------------------------------+------------------------+----------------------+
3894
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3895
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
- | 0 NVIDIA L4 Off | 00000000:38:00.0 Off | 0 |
3899
- | N/A 35C P0 27W / 72W | 1MiB / 23034MiB | 0% Default |
3900
- | | | N/A |
3901
- +-----------------------------------------+------------------------+----------------------+
3902
- | 1 NVIDIA L4 Off | 00000000:3A:00.0 Off | 0 |
3903
- | N/A 32C P0 28W / 72W | 1MiB / 23034MiB | 2% Default |
3904
- | | | N/A |
3905
- +-----------------------------------------+------------------------+----------------------+
3906
- | 2 NVIDIA L4 Off | 00000000:3C:00.0 Off | 0 |
3907
- | N/A 34C P0 27W / 72W | 1MiB / 23034MiB | 1% Default |
3908
- | | | N/A |
3909
- +-----------------------------------------+------------------------+----------------------+
3910
- | 3 NVIDIA L4 Off | 00000000:3E:00.0 Off | 0 |
3911
- | N/A 32C P0 27W / 72W | 1MiB / 23034MiB | 2% Default |
3912
  | | | N/A |
3913
  +-----------------------------------------+------------------------+----------------------+
3914
 
3915
  +-----------------------------------------------------------------------------------------+
3916
  | Processes: |
3917
- | GPU GI CI PID Type Process name GPU Memory |
3918
  | ID ID Usage |
3919
  |=========================================================================================|
3920
  | No running processes found |
@@ -3932,7 +3920,7 @@ Cell: nv | 4.02s
3932
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3933
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3934
  </span> |
3935
- Cell: benchmark | 43.68s
3936
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3937
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3938
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3988,17 +3976,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3990
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3991
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 72.447us 1172.85% 72.447us 72.447us 1
3992
- hf_kernels_swiglu 10.70% 189.904us 99.62% 1.769ms 1.769ms 0.000us 0.00% 8.289us 8.289us 1
3993
- _activation_beeaae6::silu_and_mul 1.07% 18.931us 86.38% 1.534ms 511.168us 6.177us 100.00% 8.289us 2.763us 3
3994
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.177us 100.00% 6.177us 2.059us 3
3995
- Activity Buffer Request 82.95% 1.473ms 82.95% 1.473ms 1.473ms 2.112us 34.19% 2.112us 2.112us 1
3996
- aten::empty 2.54% 45.151us 2.54% 45.151us 15.050us 0.000us 0.00% 0.000us 0.000us 3
3997
- cudaLaunchKernel 2.36% 41.961us 2.36% 41.961us 13.987us 0.000us 0.00% 0.000us 0.000us 3
3998
- cudaDeviceSynchronize 0.38% 6.701us 0.38% 6.701us 6.701us 0.000us 0.00% 0.000us 0.000us 1
3999
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4000
- Self CPU time total: 1.775ms
4001
- Self CUDA time total: 6.177us
4002
 
4003
 
4004
 
@@ -4008,17 +3996,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 91.934us 1135.55% 91.934us 91.934us 1
4012
- hf_kernels_swiglu 6.80% 114.004us 99.69% 1.672ms 1.672ms 0.000us 0.00% 10.816us 10.816us 1
4013
- _activation_beeaae6::silu_and_mul 1.26% 21.089us 91.64% 1.537ms 512.271us 8.096us 100.00% 10.816us 3.605us 3
4014
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 8.096us 100.00% 8.096us 2.699us 3
4015
- Activity Buffer Request 88.69% 1.487ms 88.69% 1.487ms 1.487ms 2.720us 33.60% 2.720us 2.720us 1
4016
- aten::empty 1.24% 20.870us 1.24% 20.870us 6.957us 0.000us 0.00% 0.000us 0.000us 3
4017
- cudaLaunchKernel 1.70% 28.501us 1.70% 28.501us 9.500us 0.000us 0.00% 0.000us 0.000us 3
4018
- cudaDeviceSynchronize 0.31% 5.260us 0.31% 5.260us 5.260us 0.000us 0.00% 0.000us 0.000us 1
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
- Self CPU time total: 1.677ms
4021
- Self CUDA time total: 8.096us
4022
 
4023
 
4024
 
@@ -4028,17 +4016,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
4028
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4029
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.039us 596.86% 67.039us 67.039us 1
4032
- hf_kernels_swiglu 5.22% 85.373us 99.71% 1.630ms 1.630ms 0.000us 0.00% 15.008us 15.008us 1
4033
- _activation_beeaae6::silu_and_mul 1.19% 19.431us 93.38% 1.527ms 508.877us 11.232us 100.00% 15.008us 5.003us 3
4034
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 11.232us 100.00% 11.232us 3.744us 3
4035
- Activity Buffer Request 90.58% 1.481ms 90.58% 1.481ms 1.481ms 3.776us 33.62% 3.776us 3.776us 1
4036
- aten::empty 1.11% 18.160us 1.11% 18.160us 6.053us 0.000us 0.00% 0.000us 0.000us 3
4037
- cudaLaunchKernel 1.61% 26.370us 1.61% 26.370us 8.790us 0.000us 0.00% 0.000us 0.000us 3
4038
- cudaDeviceSynchronize 0.29% 4.730us 0.29% 4.730us 4.730us 0.000us 0.00% 0.000us 0.000us 1
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
- Self CPU time total: 1.635ms
4041
- Self CUDA time total: 11.232us
4042
 
4043
 
4044
 
@@ -4048,17 +4036,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4050
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4051
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.598us 870.08% 69.598us 69.598us 1
4052
- hf_kernels_swiglu 4.94% 87.632us 99.74% 1.771ms 1.771ms 0.000us 0.00% 10.719us 10.719us 1
4053
- _activation_beeaae6::silu_and_mul 1.09% 19.352us 93.69% 1.663ms 554.452us 7.999us 100.00% 10.719us 3.573us 3
4054
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.999us 100.00% 7.999us 2.666us 3
4055
- Activity Buffer Request 83.17% 1.477ms 83.17% 1.477ms 1.477ms 2.720us 34.00% 2.720us 2.720us 1
4056
- aten::empty 1.11% 19.710us 1.11% 19.710us 6.570us 0.000us 0.00% 0.000us 0.000us 3
4057
- cudaLaunchKernel 9.43% 167.443us 9.43% 167.443us 55.814us 0.000us 0.00% 0.000us 0.000us 3
4058
- cudaDeviceSynchronize 0.26% 4.610us 0.26% 4.610us 4.610us 0.000us 0.00% 0.000us 0.000us 1
4059
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
- Self CPU time total: 1.775ms
4061
- Self CUDA time total: 7.999us
4062
 
4063
 
4064
 
@@ -4068,17 +4056,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
4068
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4069
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.239us 570.12% 70.239us 70.239us 1
4072
- hf_kernels_swiglu 5.14% 91.331us 99.75% 1.772ms 1.772ms 0.000us 0.00% 16.448us 16.448us 1
4073
- _activation_beeaae6::silu_and_mul 1.09% 19.360us 93.54% 1.662ms 553.872us 12.320us 100.00% 16.448us 5.483us 3
4074
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 12.320us 100.00% 12.320us 4.107us 3
4075
- Activity Buffer Request 83.14% 1.477ms 83.14% 1.477ms 1.477ms 4.128us 33.51% 4.128us 4.128us 1
4076
- aten::empty 1.07% 19.032us 1.07% 19.032us 6.344us 0.000us 0.00% 0.000us 0.000us 3
4077
- cudaLaunchKernel 9.31% 165.333us 9.31% 165.333us 55.111us 0.000us 0.00% 0.000us 0.000us 3
4078
- cudaDeviceSynchronize 0.25% 4.400us 0.25% 4.400us 4.400us 0.000us 0.00% 0.000us 0.000us 1
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
- Self CPU time total: 1.776ms
4081
- Self CUDA time total: 12.320us
4082
 
4083
 
4084
 
@@ -4088,17 +4076,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
4088
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4089
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4090
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4091
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.766us 394.32% 68.766us 68.766us 1
4092
- hf_kernels_swiglu 16.12% 86.942us 99.12% 534.642us 534.642us 0.000us 0.00% 23.263us 23.263us 1
4093
- _activation_beeaae6::silu_and_mul 3.56% 19.181us 79.14% 426.890us 142.297us 17.439us 100.00% 23.263us 7.754us 3
4094
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 17.439us 100.00% 17.439us 5.813us 3
4095
- Activity Buffer Request 44.72% 241.246us 44.72% 241.246us 241.246us 5.824us 33.40% 5.824us 5.824us 1
4096
- aten::empty 3.86% 20.810us 3.86% 20.810us 6.937us 0.000us 0.00% 0.000us 0.000us 3
4097
- cudaLaunchKernel 30.86% 166.463us 30.86% 166.463us 55.488us 0.000us 0.00% 0.000us 0.000us 3
4098
- cudaDeviceSynchronize 0.88% 4.760us 0.88% 4.760us 4.760us 0.000us 0.00% 0.000us 0.000us 1
4099
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4100
- Self CPU time total: 539.402us
4101
- Self CUDA time total: 17.439us
4102
 
4103
 
4104
 
@@ -4108,17 +4096,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4110
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4111
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.422us 541.63% 67.422us 67.422us 1
4112
- hf_kernels_swiglu 15.67% 86.170us 99.13% 545.172us 545.172us 0.000us 0.00% 16.576us 16.576us 1
4113
- _activation_beeaae6::silu_and_mul 3.45% 18.981us 79.89% 439.370us 146.457us 12.448us 100.00% 16.576us 5.525us 3
4114
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 12.448us 100.00% 12.448us 4.149us 3
4115
- Activity Buffer Request 46.28% 254.506us 46.28% 254.506us 254.506us 4.128us 33.16% 4.128us 4.128us 1
4116
- aten::empty 3.57% 19.632us 3.57% 19.632us 6.544us 0.000us 0.00% 0.000us 0.000us 3
4117
- cudaLaunchKernel 30.16% 165.883us 30.16% 165.883us 55.294us 0.000us 0.00% 0.000us 0.000us 3
4118
- cudaDeviceSynchronize 0.87% 4.770us 0.87% 4.770us 4.770us 0.000us 0.00% 0.000us 0.000us 1
4119
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4120
- Self CPU time total: 549.942us
4121
- Self CUDA time total: 12.448us
4122
 
4123
 
4124
 
@@ -4128,17 +4116,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
4128
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4129
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4130
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4131
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.941us 341.59% 70.941us 70.941us 1
4132
- hf_kernels_swiglu 15.89% 87.442us 99.17% 545.692us 545.692us 0.000us 0.00% 27.744us 27.744us 1
4133
- _activation_beeaae6::silu_and_mul 3.49% 19.210us 79.79% 439.080us 146.360us 20.768us 100.00% 27.744us 9.248us 3
4134
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 20.768us 100.00% 20.768us 6.923us 3
4135
- Activity Buffer Request 46.16% 253.986us 46.16% 253.986us 253.986us 6.976us 33.59% 6.976us 6.976us 1
4136
- aten::empty 3.48% 19.170us 3.48% 19.170us 6.390us 0.000us 0.00% 0.000us 0.000us 3
4137
- cudaLaunchKernel 30.15% 165.884us 30.15% 165.884us 55.295us 0.000us 0.00% 0.000us 0.000us 3
4138
- cudaDeviceSynchronize 0.83% 4.591us 0.83% 4.591us 4.591us 0.000us 0.00% 0.000us 0.000us 1
4139
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4140
- Self CPU time total: 550.283us
4141
- Self CUDA time total: 20.768us
4142
 
4143
 
4144
 
@@ -4148,17 +4136,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
4148
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4149
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4150
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4151
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.780us 228.74% 70.780us 70.780us 1
4152
- hf_kernels_swiglu 16.83% 85.362us 99.15% 502.911us 502.911us 0.000us 0.00% 41.183us 41.183us 1
4153
- _activation_beeaae6::silu_and_mul 3.74% 18.980us 78.74% 399.388us 133.129us 30.943us 100.00% 41.183us 13.728us 3
4154
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 30.943us 100.00% 30.943us 10.314us 3
4155
- Activity Buffer Request 42.65% 216.335us 42.65% 216.335us 216.335us 10.240us 33.09% 10.240us 10.240us 1
4156
- aten::empty 3.58% 18.161us 3.58% 18.161us 6.054us 0.000us 0.00% 0.000us 0.000us 3
4157
- cudaLaunchKernel 32.35% 164.073us 32.35% 164.073us 54.691us 0.000us 0.00% 0.000us 0.000us 3
4158
- cudaDeviceSynchronize 0.85% 4.320us 0.85% 4.320us 4.320us 0.000us 0.00% 0.000us 0.000us 1
4159
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4160
- Self CPU time total: 507.231us
4161
- Self CUDA time total: 30.943us
4162
 
4163
 
4164
  impl wl p50(ms) ok
@@ -4175,61 +4163,12 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
4175
  <div class="uv-install-logs" id="uv-logs-benchmark">
4176
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4177
  <div class="uv-logs-content" style="display: none;">
4178
- Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
4179
- Downloading numpy (15.9MiB)
4180
- Downloading setuptools (1.1MiB)
4181
- Downloading sympy (6.0MiB)
4182
- Downloading nvidia-cufft-cu12 (184.2MiB)
4183
- Downloading triton (148.4MiB)
4184
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4185
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4186
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4187
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4188
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4189
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4190
- Downloading pillow (6.7MiB)
4191
- Downloading networkx (1.9MiB)
4192
- Downloading matplotlib (8.3MiB)
4193
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4194
- Downloading nvidia-curand-cu12 (60.7MiB)
4195
- Downloading kiwisolver (1.4MiB)
4196
- Downloading nvidia-cufile-cu12 (1.1MiB)
4197
- Downloading hf-xet (3.2MiB)
4198
- Downloading nvidia-nccl-cu12 (307.4MiB)
4199
- Downloading fonttools (4.7MiB)
4200
- Downloading nvidia-cublas-cu12 (566.8MiB)
4201
- Downloading torch (846.8MiB)
4202
- Downloading nvidia-cufile-cu12
4203
- Downloading kiwisolver
4204
- Downloading hf-xet
4205
- Downloading setuptools
4206
- Downloading fonttools
4207
- Downloading networkx
4208
- Downloading pillow
4209
- Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
4210
- Downloading nvidia-cuda-cupti-cu12
4211
- Downloading matplotlib
4212
- Downloading numpy
4213
- Downloading nvidia-nvjitlink-cu12
4214
- Downloading sympy
4215
- Downloading nvidia-curand-cu12
4216
- Downloading nvidia-cuda-nvrtc-cu12
4217
- Downloading triton
4218
- Downloading nvidia-cufft-cu12
4219
- Downloading nvidia-cusolver-cu12
4220
- Downloading nvidia-cusparse-cu12
4221
- Downloading nvidia-cusparselt-cu12
4222
- Downloading nvidia-nccl-cu12
4223
- Downloading nvidia-cublas-cu12
4224
- Downloading nvidia-cudnn-cu12
4225
- Downloading torch
4226
- Installed 47 packages in 234ms
4227
  </div>
4228
  </div>
4229
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4230
- Fetching 7 files: 14%|█▍ | 1/7 [00:00&lt;00:01, 4.08it/s]
4231
- Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 9.40it/s]
4232
- Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 12.20it/s]</div>
4233
  <div class="cell-artifacts">
4234
  <h4>Artifacts:</h4>
4235
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
 
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: nv | 0.26s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
+ <div class="cell-stdout"><pre class="stdout-text">Mon Oct 27 14:46:00 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
+ | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
3894
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3895
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
+ | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
+ | N/A 32C P0 153W / 350W | 0MiB / 46068MiB | 75% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
3903
  +-----------------------------------------------------------------------------------------+
3904
  | Processes: |
3905
+ | GPU GI CI PID Type Process name GPU Memory |
3906
  | ID ID Usage |
3907
  |=========================================================================================|
3908
  | No running processes found |
 
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
+ Cell: benchmark | 4.32s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3976
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3978
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3979
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 80.128us 1940.62% 80.128us 80.128us 1
3980
+ hf_kernels_swiglu 11.19% 199.383us 99.56% 1.774ms 1.774ms 0.000us 0.00% 5.634us 5.634us 1
3981
+ _activation_beeaae6::silu_and_mul 1.10% 19.601us 85.64% 1.526ms 508.618us 4.129us 100.00% 5.634us 1.878us 3
3982
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.129us 100.00% 4.129us 1.376us 3
3983
+ Activity Buffer Request 82.30% 1.466ms 82.30% 1.466ms 1.466ms 1.505us 36.45% 1.505us 1.505us 1
3984
+ aten::empty 2.73% 48.641us 2.73% 48.641us 16.214us 0.000us 0.00% 0.000us 0.000us 3
3985
+ cudaLaunchKernel 2.24% 39.931us 2.24% 39.931us 13.310us 0.000us 0.00% 0.000us 0.000us 3
3986
+ cudaDeviceSynchronize 0.44% 7.891us 0.44% 7.891us 7.891us 0.000us 0.00% 0.000us 0.000us 1
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
+ Self CPU time total: 1.782ms
3989
+ Self CUDA time total: 4.129us
3990
 
3991
 
3992
 
 
3996
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3997
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 77.823us 1961.76% 77.823us 77.823us 1
4000
+ hf_kernels_swiglu 7.28% 119.722us 99.70% 1.640ms 1.640ms 0.000us 0.00% 5.311us 5.311us 1
4001
+ _activation_beeaae6::silu_and_mul 1.57% 25.841us 91.18% 1.500ms 499.858us 3.967us 100.00% 5.311us 1.770us 3
4002
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.967us 100.00% 3.967us 1.322us 3
4003
+ Activity Buffer Request 87.74% 1.443ms 87.74% 1.443ms 1.443ms 1.344us 33.88% 1.344us 1.344us 1
4004
+ aten::empty 1.24% 20.410us 1.24% 20.410us 6.803us 0.000us 0.00% 0.000us 0.000us 3
4005
+ cudaLaunchKernel 1.86% 30.650us 1.86% 30.650us 10.217us 0.000us 0.00% 0.000us 0.000us 3
4006
+ cudaDeviceSynchronize 0.30% 4.930us 0.30% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
+ Self CPU time total: 1.645ms
4009
+ Self CUDA time total: 3.967us
4010
 
4011
 
4012
 
 
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.487us 1369.46% 67.487us 67.487us 1
4020
+ hf_kernels_swiglu 6.70% 107.400us 99.69% 1.598ms 1.598ms 0.000us 0.00% 6.592us 6.592us 1
4021
+ _activation_beeaae6::silu_and_mul 1.32% 21.191us 91.79% 1.471ms 490.438us 4.928us 100.00% 6.592us 2.197us 3
4022
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.928us 100.00% 4.928us 1.643us 3
4023
+ Activity Buffer Request 88.89% 1.425ms 88.89% 1.425ms 1.425ms 1.664us 33.77% 1.664us 1.664us 1
4024
+ aten::empty 1.20% 19.281us 1.20% 19.281us 6.427us 0.000us 0.00% 0.000us 0.000us 3
4025
+ cudaLaunchKernel 1.57% 25.210us 1.57% 25.210us 8.403us 0.000us 0.00% 0.000us 0.000us 3
4026
+ cudaDeviceSynchronize 0.31% 4.970us 0.31% 4.970us 4.970us 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
+ Self CPU time total: 1.603ms
4029
+ Self CUDA time total: 4.928us
4030
 
4031
 
4032
 
 
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 75.265us 1768.03% 75.265us 75.265us 1
4040
+ hf_kernels_swiglu 6.51% 118.032us 99.70% 1.807ms 1.807ms 0.000us 0.00% 5.697us 5.697us 1
4041
+ _activation_beeaae6::silu_and_mul 1.22% 22.071us 92.05% 1.668ms 556.119us 4.257us 100.00% 5.697us 1.899us 3
4042
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.257us 100.00% 4.257us 1.419us 3
4043
+ Activity Buffer Request 79.39% 1.439ms 79.39% 1.439ms 1.439ms 1.440us 33.83% 1.440us 1.440us 1
4044
+ aten::empty 1.14% 20.640us 1.14% 20.640us 6.880us 0.000us 0.00% 0.000us 0.000us 3
4045
+ cudaLaunchKernel 11.45% 207.513us 11.45% 207.513us 69.171us 0.000us 0.00% 0.000us 0.000us 3
4046
+ cudaDeviceSynchronize 0.30% 5.350us 0.30% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
+ Self CPU time total: 1.812ms
4049
+ Self CUDA time total: 4.257us
4050
 
4051
 
4052
 
 
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.471us 1111.94% 65.471us 65.471us 1
4060
+ hf_kernels_swiglu 19.52% 89.390us 98.84% 452.537us 452.537us 0.000us 0.00% 7.872us 7.872us 1
4061
+ _activation_beeaae6::silu_and_mul 5.02% 23.003us 75.04% 343.547us 114.516us 5.888us 100.00% 7.872us 2.624us 3
4062
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 100.00% 5.888us 1.963us 3
4063
+ Activity Buffer Request 33.89% 155.152us 33.89% 155.152us 155.152us 1.984us 33.70% 1.984us 1.984us 1
4064
+ aten::empty 4.28% 19.600us 4.28% 19.600us 6.533us 0.000us 0.00% 0.000us 0.000us 3
4065
+ cudaLaunchKernel 36.13% 165.392us 36.13% 165.392us 55.131us 0.000us 0.00% 0.000us 0.000us 3
4066
+ cudaDeviceSynchronize 1.16% 5.290us 1.16% 5.290us 5.290us 0.000us 0.00% 0.000us 0.000us 1
4067
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4068
+ Self CPU time total: 457.827us
4069
+ Self CUDA time total: 5.888us
4070
 
4071
 
4072
 
 
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4078
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4079
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.383us 879.52% 68.383us 68.383us 1
4080
+ hf_kernels_swiglu 6.83% 118.711us 99.72% 1.734ms 1.734ms 0.000us 0.00% 10.367us 10.367us 1
4081
+ _activation_beeaae6::silu_and_mul 1.25% 21.741us 91.78% 1.596ms 531.855us 7.775us 100.00% 10.367us 3.456us 3
4082
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.775us 100.00% 7.775us 2.592us 3
4083
+ Activity Buffer Request 81.74% 1.421ms 81.74% 1.421ms 1.421ms 2.592us 33.34% 2.592us 2.592us 1
4084
+ aten::empty 1.11% 19.311us 1.11% 19.311us 6.437us 0.000us 0.00% 0.000us 0.000us 3
4085
+ cudaLaunchKernel 8.79% 152.752us 8.79% 152.752us 50.917us 0.000us 0.00% 0.000us 0.000us 3
4086
+ cudaDeviceSynchronize 0.28% 4.930us 0.28% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
+ Self CPU time total: 1.739ms
4089
+ Self CUDA time total: 7.775us
4090
 
4091
 
4092
 
 
4096
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4097
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4098
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4099
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.527us 1069.89% 70.527us 70.527us 1
4100
+ hf_kernels_swiglu 6.20% 108.691us 99.73% 1.749ms 1.749ms 0.000us 0.00% 8.800us 8.800us 1
4101
+ _activation_beeaae6::silu_and_mul 1.29% 22.622us 92.35% 1.619ms 539.785us 6.592us 100.00% 8.800us 2.933us 3
4102
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 100.00% 6.592us 2.197us 3
4103
+ Activity Buffer Request 82.48% 1.446ms 82.48% 1.446ms 1.446ms 2.208us 33.50% 2.208us 2.208us 1
4104
+ aten::empty 1.18% 20.650us 1.18% 20.650us 6.883us 0.000us 0.00% 0.000us 0.000us 3
4105
+ cudaLaunchKernel 8.58% 150.492us 8.58% 150.492us 50.164us 0.000us 0.00% 0.000us 0.000us 3
4106
+ cudaDeviceSynchronize 0.27% 4.780us 0.27% 4.780us 4.780us 0.000us 0.00% 0.000us 0.000us 1
4107
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4108
+ Self CPU time total: 1.753ms
4109
+ Self CUDA time total: 6.592us
4110
 
4111
 
4112
 
 
4116
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4117
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.591us 703.03% 66.591us 66.591us 1
4120
+ hf_kernels_swiglu 22.91% 88.512us 98.75% 381.506us 381.506us 0.000us 0.00% 12.640us 12.640us 1
4121
+ _activation_beeaae6::silu_and_mul 5.22% 20.151us 70.42% 272.064us 90.688us 9.472us 100.00% 12.640us 4.213us 3
4122
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.472us 100.00% 9.472us 3.157us 3
4123
+ Activity Buffer Request 26.21% 101.241us 26.21% 101.241us 101.241us 3.168us 33.45% 3.168us 3.168us 1
4124
+ aten::empty 5.42% 20.930us 5.42% 20.930us 6.977us 0.000us 0.00% 0.000us 0.000us 3
4125
+ cudaLaunchKernel 39.00% 150.672us 39.00% 150.672us 50.224us 0.000us 0.00% 0.000us 0.000us 3
4126
+ cudaDeviceSynchronize 1.25% 4.820us 1.25% 4.820us 4.820us 0.000us 0.00% 0.000us 0.000us 1
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
+ Self CPU time total: 386.326us
4129
+ Self CUDA time total: 9.472us
4130
 
4131
 
4132
 
 
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.295us 514.21% 67.295us 67.295us 1
+ hf_kernels_swiglu 24.05% 101.492us 98.90% 417.266us 417.266us 0.000us 0.00% 17.503us 17.503us 1
+ _activation_beeaae6::silu_and_mul 5.33% 22.480us 70.08% 295.684us 98.561us 13.087us 100.00% 17.503us 5.834us 3
+ void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.087us 100.00% 13.087us 4.362us 3
+ Activity Buffer Request 28.92% 122.012us 28.92% 122.012us 122.012us 4.416us 33.74% 4.416us 4.416us 1
+ aten::empty 4.76% 20.090us 4.76% 20.090us 6.697us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 35.83% 151.192us 35.83% 151.192us 50.397us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.10% 4.660us 1.10% 4.660us 4.660us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 421.926us
+ Self CUDA time total: 13.087us
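
For reference, each fused trace above reduces to a single _activation_beeaae6::silu_and_mul launch (the vllm-derived act_and_mul_kernel) per call. A minimal sketch of driving that op, assuming the Hugging Face kernels package and the kernels-community/activation repo (an assumption; this diff only shows the op names, not the benchmark source):

    # Sketch only: assumes kernels-community/activation exposes silu_and_mul
    # with the vllm-style (out, input) signature suggested by the trace above.
    import torch
    from kernels import get_kernel

    activation = get_kernel("kernels-community/activation")

    x = torch.randn(128, 2 * 768, device="cuda", dtype=torch.bfloat16)
    out = torch.empty(128, 768, device="cuda", dtype=torch.bfloat16)  # aten::empty
    activation.silu_and_mul(out, x)  # one fused kernel launch per call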
  impl wl p50(ms) ok
 
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
+ Installed 15 packages in 15ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
+ Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 13.68it/s]
+ Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 19.14it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
activation/impls/torch_swiglu.html CHANGED
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <div class="system-info">
 <div class="system-info-header">Generated on:</div>
 <div class="system-info-content">
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
 </div>
 </div>

@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
- Cell: nv | 4.02s
 | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,34 +3887,22 @@ Cell: nv | 4.02s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
- <div class="cell-stdout"><pre class="stdout-text">Fri Oct 24 19:24:09 2025
 +-----------------------------------------------------------------------------------------+
- | NVIDIA-SMI 560.35.05 Driver Version: 560.35.05 CUDA Version: 12.6 |
 |-----------------------------------------+------------------------+----------------------+
 | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
 | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
 | | | MIG M. |
 |=========================================+========================+======================|
- | 0 NVIDIA L4 Off | 00000000:38:00.0 Off | 0 |
- | N/A 36C P0 27W / 72W | 1MiB / 23034MiB | 2% Default |
- | | | N/A |
- +-----------------------------------------+------------------------+----------------------+
- | 1 NVIDIA L4 Off | 00000000:3A:00.0 Off | 0 |
- | N/A 33C P0 28W / 72W | 1MiB / 23034MiB | 0% Default |
- | | | N/A |
- +-----------------------------------------+------------------------+----------------------+
- | 2 NVIDIA L4 Off | 00000000:3C:00.0 Off | 0 |
- | N/A 34C P0 27W / 72W | 1MiB / 23034MiB | 0% Default |
- | | | N/A |
- +-----------------------------------------+------------------------+----------------------+
- | 3 NVIDIA L4 Off | 00000000:3E:00.0 Off | 0 |
- | N/A 33C P0 27W / 72W | 1MiB / 23034MiB | 2% Default |
 | | | N/A |
 +-----------------------------------------+------------------------+----------------------+

 +-----------------------------------------------------------------------------------------+
 | Processes: |
- | GPU GI CI PID Type Process name GPU Memory |
 | ID ID Usage |
 |=========================================================================================|
 | No running processes found |
@@ -3932,7 +3920,7 @@ Cell: nv | 4.02s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
- Cell: benchmark | 42.42s
 | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3982,20 +3970,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 237.726us 1604.74% 237.726us 237.726us 1
- torch_eager 11.30% 225.975us 99.63% 1.992ms 1.992ms 0.000us 0.00% 17.566us 17.566us 1
- aten::silu 3.42% 68.411us 81.12% 1.622ms 540.728us 7.646us 51.61% 10.398us 3.466us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.646us 51.61% 7.646us 2.549us 3
- aten::mul 2.15% 42.970us 3.33% 66.621us 22.207us 7.168us 48.39% 7.168us 2.389us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.168us 48.39% 7.168us 2.389us 3
- Activity Buffer Request 74.74% 1.495ms 74.74% 1.495ms 1.495ms 2.752us 18.58% 2.752us 2.752us 1
- aten::slice 3.26% 65.261us 3.88% 77.582us 12.930us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.62% 12.321us 0.62% 12.321us 2.053us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 4.14% 82.803us 4.14% 82.803us 13.800us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.37% 7.380us 0.37% 7.380us 7.380us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 2.000ms
- Self CUDA time total: 14.814us

@@ -4005,20 +3993,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.197us 1056.55% 155.197us 155.197us 1
- torch_eager 6.38% 113.914us 99.69% 1.779ms 1.779ms 0.000us 0.00% 17.249us 17.249us 1
- aten::silu 2.13% 37.960us 88.89% 1.587ms 528.841us 7.616us 51.85% 10.176us 3.392us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 51.85% 7.616us 2.539us 3
- aten::mul 1.58% 28.130us 2.63% 46.991us 15.664us 7.073us 48.15% 7.073us 2.358us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.073us 48.15% 7.073us 2.358us 3
- Activity Buffer Request 85.27% 1.522ms 85.27% 1.522ms 1.522ms 2.560us 17.43% 2.560us 2.560us 1
- aten::slice 1.43% 25.481us 1.78% 31.850us 5.308us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.36% 6.369us 0.36% 6.369us 1.061us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 2.55% 45.552us 2.55% 45.552us 7.592us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.31% 5.590us 0.31% 5.590us 5.590us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 1.785ms
- Self CUDA time total: 14.689us

@@ -4028,20 +4016,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 157.724us 928.23% 157.724us 157.724us 1
- torch_eager 6.06% 107.501us 99.72% 1.769ms 1.769ms 0.000us 0.00% 19.872us 19.872us 1
- aten::silu 2.60% 46.162us 89.17% 1.581ms 527.145us 8.576us 50.47% 11.456us 3.819us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 8.576us 50.47% 8.576us 2.859us 3
- aten::mul 1.54% 27.281us 2.61% 46.211us 15.404us 8.416us 49.53% 8.416us 2.805us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 8.416us 49.53% 8.416us 2.805us 3
- Activity Buffer Request 85.05% 1.508ms 85.05% 1.508ms 1.508ms 2.880us 16.95% 2.880us 2.880us 1
- aten::slice 1.51% 26.721us 1.88% 33.391us 5.565us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.38% 6.670us 0.38% 6.670us 1.112us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 2.58% 45.781us 2.58% 45.781us 7.630us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.28% 4.940us 0.28% 4.940us 4.940us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 1.773ms
- Self CUDA time total: 16.992us

@@ -4051,20 +4039,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 154.972us 984.26% 154.972us 154.972us 1
- torch_eager 7.81% 106.363us 99.66% 1.357ms 1.357ms 0.000us 0.00% 18.497us 18.497us 1
- aten::silu 3.01% 41.020us 86.15% 1.173ms 391.021us 8.096us 51.42% 10.848us 3.616us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 8.096us 51.42% 8.096us 2.699us 3
- aten::mul 1.89% 25.761us 3.27% 44.581us 14.860us 7.649us 48.58% 7.649us 2.550us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.649us 48.58% 7.649us 2.550us 3
- Activity Buffer Request 68.76% 936.210us 68.76% 936.210us 936.210us 2.752us 17.48% 2.752us 2.752us 1
- aten::slice 1.90% 25.829us 2.43% 33.031us 5.505us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.53% 7.202us 0.53% 7.202us 1.200us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 15.76% 214.654us 15.76% 214.654us 35.776us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.34% 4.590us 0.34% 4.590us 4.590us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 1.362ms
- Self CUDA time total: 15.745us

@@ -4074,20 +4062,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.837us 910.37% 155.837us 155.837us 1
- torch_eager 5.68% 106.351us 99.75% 1.869ms 1.869ms 0.000us 0.00% 20.126us 20.126us 1
- aten::silu 2.11% 39.481us 89.91% 1.685ms 561.559us 8.671us 50.65% 11.679us 3.893us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 8.671us 50.65% 8.671us 2.890us 3
- aten::mul 1.44% 26.891us 2.49% 46.661us 15.554us 8.447us 49.35% 8.447us 2.816us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 8.447us 49.35% 8.447us 2.816us 3
- Activity Buffer Request 78.61% 1.473ms 78.61% 1.473ms 1.473ms 3.008us 17.57% 3.008us 3.008us 1
- aten::slice 1.33% 24.861us 1.68% 31.451us 5.242us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.35% 6.590us 0.35% 6.590us 1.098us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 10.25% 192.054us 10.25% 192.054us 32.009us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.25% 4.670us 0.25% 4.670us 4.670us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 1.874ms
- Self CUDA time total: 17.118us

@@ -4097,20 +4085,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.486us 596.92% 155.486us 155.486us 1
- torch_eager 20.98% 108.302us 99.11% 511.621us 511.621us 0.000us 0.00% 30.592us 30.592us 1
- aten::silu 7.61% 39.290us 63.32% 326.866us 108.955us 13.504us 51.84% 18.048us 6.016us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 13.504us 51.84% 13.504us 4.501us 3
- aten::mul 5.03% 25.960us 8.46% 43.671us 14.557us 12.544us 48.16% 12.544us 4.181us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 12.544us 48.16% 12.544us 4.181us 3
- Activity Buffer Request 25.15% 129.813us 25.15% 129.813us 129.813us 4.544us 17.44% 4.544us 4.544us 1
- aten::slice 5.13% 26.471us 6.35% 32.782us 5.464us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 1.22% 6.311us 1.22% 6.311us 1.052us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 33.99% 175.474us 33.99% 175.474us 29.246us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.89% 4.611us 0.89% 4.611us 4.611us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 516.232us
- Self CUDA time total: 26.048us

@@ -4120,20 +4108,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 163.293us 685.87% 163.293us 163.293us 1
- torch_eager 5.58% 106.954us 99.75% 1.910ms 1.910ms 0.000us 0.00% 27.872us 27.872us 1
- aten::silu 2.13% 40.799us 89.92% 1.722ms 574.075us 12.032us 50.54% 16.096us 5.365us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 12.032us 50.54% 12.032us 4.011us 3
- aten::mul 1.39% 26.590us 2.40% 46.050us 15.350us 11.776us 49.46% 11.776us 3.925us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.776us 49.46% 11.776us 3.925us 3
- Activity Buffer Request 79.43% 1.521ms 79.43% 1.521ms 1.521ms 4.064us 17.07% 4.064us 4.064us 1
- aten::slice 1.44% 27.592us 1.83% 35.091us 5.849us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.39% 7.499us 0.39% 7.499us 1.250us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 9.38% 179.564us 9.38% 179.564us 29.927us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.25% 4.880us 0.25% 4.880us 4.880us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 1.915ms
- Self CUDA time total: 23.808us

@@ -4143,20 +4131,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 157.181us 605.66% 157.181us 157.181us 1
- torch_eager 5.64% 105.982us 99.73% 1.874ms 1.874ms 0.000us 0.00% 30.528us 30.528us 1
- aten::silu 2.16% 40.612us 89.86% 1.688ms 562.829us 13.440us 51.79% 18.016us 6.005us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 13.440us 51.79% 13.440us 4.480us 3
- aten::mul 1.34% 25.270us 2.38% 44.720us 14.907us 12.512us 48.21% 12.512us 4.171us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 12.512us 48.21% 12.512us 4.171us 3
- Activity Buffer Request 79.27% 1.489ms 79.27% 1.489ms 1.489ms 4.576us 17.63% 4.576us 4.576us 1
- aten::slice 1.48% 27.801us 1.85% 34.741us 5.790us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.37% 6.940us 0.37% 6.940us 1.157us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 9.47% 177.873us 9.47% 177.873us 29.645us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.27% 5.010us 0.27% 5.010us 5.010us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 1.879ms
- Self CUDA time total: 25.952us

@@ -4166,26 +4154,26 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 158.077us 375.10% 158.077us 158.077us 1
- torch_eager 5.61% 105.585us 99.74% 1.877ms 1.877ms 0.000us 0.00% 49.375us 49.375us 1
- aten::silu 2.18% 41.121us 90.06% 1.695ms 564.996us 21.856us 51.86% 29.088us 9.696us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 21.856us 51.86% 21.856us 7.285us 3
- aten::mul 1.38% 26.000us 2.45% 46.100us 15.367us 20.287us 48.14% 20.287us 6.762us 3
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 20.287us 48.14% 20.287us 6.762us 3
- Activity Buffer Request 79.53% 1.497ms 79.53% 1.497ms 1.497ms 7.232us 17.16% 7.232us 7.232us 1
- aten::slice 1.26% 23.718us 1.62% 30.479us 5.080us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.36% 6.761us 0.36% 6.761us 1.127us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 9.41% 177.183us 9.41% 177.183us 29.531us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.26% 4.970us 0.26% 4.970us 4.970us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 1.882ms
- Self CUDA time total: 42.143us

 impl wl p50(ms) ok
 torch_eager cuda_T128_D1024 0.05 True
 torch_eager cuda_T128_D2048 0.05 True
- torch_eager cuda_T128_D768 0.05 True
 torch_eager cuda_T256_D1024 0.05 True
 torch_eager cuda_T256_D2048 0.05 True
 torch_eager cuda_T256_D768 0.05 True
@@ -4196,53 +4184,7 @@ torch_eager cuda_T512_D768 0.05 True
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
- Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading setuptools (1.1MiB)
- Downloading sympy (6.0MiB)
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
- Downloading nvidia-cublas-cu12 (566.8MiB)
- Downloading networkx (1.9MiB)
- Downloading numpy (15.9MiB)
- Downloading matplotlib (8.3MiB)
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
- Downloading nvidia-cusolver-cu12 (255.1MiB)
- Downloading pillow (6.7MiB)
- Downloading nvidia-nccl-cu12 (307.4MiB)
- Downloading nvidia-cudnn-cu12 (674.0MiB)
- Downloading fonttools (4.7MiB)
- Downloading kiwisolver (1.4MiB)
- Downloading nvidia-curand-cu12 (60.7MiB)
- Downloading nvidia-cusparse-cu12 (274.9MiB)
- Downloading nvidia-cufile-cu12 (1.1MiB)
- Downloading nvidia-cufft-cu12 (184.2MiB)
- Downloading torch (846.8MiB)
- Downloading triton (148.4MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading networkx
- Downloading fonttools
- Downloading pillow
- Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading matplotlib
- Downloading nvidia-cuda-cupti-cu12
- Downloading numpy
- Downloading nvidia-nvjitlink-cu12
- Downloading sympy
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
- Installed 37 packages in 214ms
 </div>
 </div>
 <div class="cell-artifacts">
 
 <div class="system-info">
 <div class="system-info-header">Generated on:</div>
 <div class="system-info-content">
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
 </div>
 </div>

 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
+ Cell: nv | 0.26s
 | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>

 </div>
 </div>
 <div id="output-nv" class="cell-output">
+ <div class="cell-stdout"><pre class="stdout-text">Mon Oct 27 14:46:00 2025
 +-----------------------------------------------------------------------------------------+
+ | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
 |-----------------------------------------+------------------------+----------------------+
 | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
 | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
 | | | MIG M. |
 |=========================================+========================+======================|
+ | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+ | N/A 32C P0 153W / 350W | 0MiB / 46068MiB | 75% Default |
 | | | N/A |
 +-----------------------------------------+------------------------+----------------------+

 +-----------------------------------------------------------------------------------------+
 | Processes: |
+ | GPU GI CI PID Type Process name GPU Memory |
 | ID ID Usage |
 |=========================================================================================|
 | No running processes found |

 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+ Cell: benchmark | 6.99s
 | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 183.359us 1436.08% 183.359us 183.359us 1
+ torch_eager 11.24% 212.694us 99.53% 1.883ms 1.883ms 0.000us 0.00% 15.072us 15.072us 1
+ aten::silu 3.31% 62.660us 82.30% 1.557ms 519.134us 6.527us 51.12% 8.831us 2.944us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.527us 51.12% 6.527us 2.176us 3
+ aten::mul 1.85% 35.100us 2.98% 56.340us 18.780us 6.241us 48.88% 6.241us 2.080us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.241us 48.88% 6.241us 2.080us 3
+ Activity Buffer Request 76.74% 1.452ms 76.74% 1.452ms 1.452ms 2.304us 18.05% 2.304us 2.304us 1
+ aten::slice 2.41% 45.561us 3.01% 56.902us 9.484us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.60% 11.341us 0.60% 11.341us 1.890us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 3.37% 63.741us 3.37% 63.741us 10.623us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.47% 8.969us 0.47% 8.969us 8.969us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 1.892ms
+ Self CUDA time total: 12.768us
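
For reference, the torch_eager traces are the unfused SwiGLU: two aten::slice views, one aten::silu, and one aten::mul, giving two elementwise kernel launches per call. A minimal equivalent sketch, with shapes matching the cuda_T128_D768 workload (illustrative only, not taken verbatim from benchmark.py):

    import torch
    import torch.nn.functional as F

    def swiglu_eager(x: torch.Tensor) -> torch.Tensor:
        # Two slices, one silu, one mul: the aten ops seen in the trace above.
        d = x.shape[-1] // 2
        return F.silu(x[..., :d]) * x[..., d:]

    x = torch.randn(128, 2 * 768, device="cuda", dtype=torch.bfloat16)
    y = swiglu_eager(x)  # shape (128, 768)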
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 158.431us 1279.63% 158.431us 158.431us 1
+ torch_eager 6.85% 117.301us 99.69% 1.707ms 1.707ms 0.000us 0.00% 14.557us 14.557us 1
+ aten::silu 2.45% 41.990us 88.25% 1.511ms 503.680us 6.398us 51.68% 8.574us 2.858us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.398us 51.68% 6.398us 2.133us 3
+ aten::mul 1.63% 27.830us 2.78% 47.630us 15.877us 5.983us 48.32% 5.983us 1.994us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.983us 48.32% 5.983us 1.994us 3
+ Activity Buffer Request 84.28% 1.443ms 84.28% 1.443ms 1.443ms 2.176us 17.58% 2.176us 2.176us 1
+ aten::slice 1.45% 24.820us 1.81% 30.931us 5.155us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.36% 6.111us 0.36% 6.111us 1.019us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 2.67% 45.711us 2.67% 45.711us 7.618us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.31% 5.320us 0.31% 5.320us 5.320us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 1.712ms
+ Self CUDA time total: 12.381us
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 145.182us 1095.88% 145.182us 145.182us 1
+ torch_eager 6.28% 105.841us 99.65% 1.680ms 1.680ms 0.000us 0.00% 15.552us 15.552us 1
+ aten::silu 2.40% 40.400us 89.03% 1.501ms 500.258us 6.816us 51.45% 9.120us 3.040us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 51.45% 6.816us 2.272us 3
+ aten::mul 1.52% 25.690us 2.64% 44.480us 14.827us 6.432us 48.55% 6.432us 2.144us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 48.55% 6.432us 2.144us 3
+ Activity Buffer Request 85.10% 1.434ms 85.10% 1.434ms 1.434ms 2.304us 17.39% 2.304us 2.304us 1
+ aten::slice 1.37% 23.030us 1.70% 28.690us 4.782us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.34% 5.660us 0.34% 5.660us 0.943us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 2.66% 44.762us 2.66% 44.762us 7.460us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.35% 5.820us 0.35% 5.820us 5.820us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 1.686ms
+ Self CUDA time total: 13.248us
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 145.025us 1135.85% 145.025us 145.025us 1
+ torch_eager 7.55% 116.292us 99.65% 1.535ms 1.535ms 0.000us 0.00% 14.976us 14.976us 1
+ aten::silu 2.67% 41.061us 87.34% 1.345ms 448.460us 6.592us 51.63% 8.800us 2.933us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 51.63% 6.592us 2.197us 3
+ aten::mul 1.71% 26.359us 2.88% 44.330us 14.777us 6.176us 48.37% 6.176us 2.059us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.37% 6.176us 2.059us 3
+ Activity Buffer Request 69.61% 1.072ms 69.61% 1.072ms 1.072ms 2.208us 17.29% 2.208us 2.208us 1
+ aten::slice 1.52% 23.350us 1.89% 29.050us 4.842us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.37% 5.700us 0.37% 5.700us 0.950us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 16.23% 250.045us 16.23% 250.045us 41.674us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.35% 5.360us 0.35% 5.360us 5.360us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 1.540ms
+ Self CUDA time total: 12.768us
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 144.030us 1089.82% 144.030us 144.030us 1
+ torch_eager 5.82% 104.551us 99.68% 1.792ms 1.792ms 0.000us 0.00% 15.488us 15.488us 1
+ aten::silu 2.32% 41.682us 89.81% 1.614ms 538.151us 6.752us 51.09% 9.024us 3.008us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 51.09% 6.752us 2.251us 3
+ aten::mul 1.41% 25.409us 2.48% 44.550us 14.850us 6.464us 48.91% 6.464us 2.155us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.91% 6.464us 2.155us 3
+ Activity Buffer Request 78.50% 1.411ms 78.50% 1.411ms 1.411ms 2.272us 17.19% 2.272us 2.272us 1
+ aten::slice 1.27% 22.830us 1.58% 28.320us 4.720us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.31% 5.490us 0.31% 5.490us 0.915us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.06% 180.853us 10.06% 180.853us 30.142us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.32% 5.710us 0.32% 5.710us 5.710us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 1.798ms
+ Self CUDA time total: 13.216us
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 140.382us 902.66% 140.382us 140.382us 1
+ torch_eager 21.39% 103.633us 98.99% 479.697us 479.697us 0.000us 0.00% 18.240us 18.240us 1
+ aten::silu 8.56% 41.460us 63.18% 306.154us 102.051us 7.936us 51.03% 10.624us 3.541us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.03% 7.936us 2.645us 3
+ aten::mul 4.90% 23.759us 8.63% 41.840us 13.947us 7.616us 48.97% 7.616us 2.539us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 48.97% 7.616us 2.539us 3
+ Activity Buffer Request 23.12% 112.032us 23.12% 112.032us 112.032us 2.688us 17.28% 2.688us 2.688us 1
+ aten::slice 4.68% 22.671us 5.79% 28.070us 4.678us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 1.11% 5.399us 1.11% 5.399us 0.900us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 35.23% 170.743us 35.23% 170.743us 28.457us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 1.01% 4.900us 1.01% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 484.597us
+ Self CUDA time total: 15.552us
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 145.662us 1011.54% 145.662us 145.662us 1
+ torch_eager 5.99% 108.381us 99.73% 1.804ms 1.804ms 0.000us 0.00% 16.896us 16.896us 1
+ aten::silu 2.28% 41.342us 89.69% 1.623ms 540.945us 7.392us 51.33% 9.888us 3.296us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 51.33% 7.392us 2.464us 3
+ aten::mul 1.44% 26.049us 2.45% 44.420us 14.807us 7.008us 48.67% 7.008us 2.336us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.67% 7.008us 2.336us 3
+ Activity Buffer Request 78.99% 1.429ms 78.99% 1.429ms 1.429ms 2.496us 17.33% 2.496us 2.496us 1
+ aten::slice 1.28% 23.160us 1.59% 28.810us 4.802us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.31% 5.650us 0.31% 5.650us 0.942us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.43% 170.603us 9.43% 170.603us 28.434us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.27% 4.930us 0.27% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 1.809ms
+ Self CUDA time total: 14.400us
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 142.206us 914.45% 142.206us 142.206us 1
+ torch_eager 21.70% 105.494us 98.87% 480.727us 480.727us 0.000us 0.00% 18.239us 18.239us 1
+ aten::silu 8.21% 39.900us 62.39% 303.354us 101.118us 7.966us 51.23% 10.654us 3.551us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.966us 51.23% 7.966us 2.655us 3
+ aten::mul 5.16% 25.070us 8.84% 42.990us 14.330us 7.585us 48.77% 7.585us 2.528us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.585us 48.77% 7.585us 2.528us 3
+ Activity Buffer Request 23.29% 113.242us 23.29% 113.242us 113.242us 2.688us 17.29% 2.688us 2.688us 1
+ aten::slice 4.75% 23.080us 5.94% 28.889us 4.815us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 1.19% 5.809us 1.19% 5.809us 0.968us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 34.58% 168.132us 34.58% 168.132us 28.022us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 1.13% 5.500us 1.13% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 486.227us
+ Self CUDA time total: 15.551us
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 149.022us 661.50% 149.022us 149.022us 1
+ torch_eager 5.72% 105.900us 99.72% 1.847ms 1.847ms 0.000us 0.00% 26.431us 26.431us 1
+ aten::silu 2.24% 41.461us 90.05% 1.668ms 555.875us 11.552us 51.28% 15.455us 5.152us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 51.28% 11.552us 3.851us 3
+ aten::mul 1.41% 26.021us 2.40% 44.421us 14.807us 10.976us 48.72% 10.976us 3.659us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.976us 48.72% 10.976us 3.659us 3
+ Activity Buffer Request 79.50% 1.472ms 79.50% 1.472ms 1.472ms 3.903us 17.33% 3.903us 3.903us 1
+ aten::slice 1.25% 23.131us 1.56% 28.831us 4.805us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.31% 5.700us 0.31% 5.700us 0.950us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.31% 172.382us 9.31% 172.382us 28.730us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.28% 5.130us 0.28% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 1.852ms
+ Self CUDA time total: 22.528us

 impl wl p50(ms) ok
 torch_eager cuda_T128_D1024 0.05 True
 torch_eager cuda_T128_D2048 0.05 True
+ torch_eager cuda_T128_D768 0.04 True
 torch_eager cuda_T256_D1024 0.05 True
 torch_eager cuda_T256_D2048 0.05 True
 torch_eager cuda_T256_D768 0.05 True
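
The p50(ms) column is the median of the repeated timings each run writes to activation.jsonl (linked under Artifacts). A sketch of recovering it offline, assuming (not confirmed by this diff) that each JSONL record keeps its raw per-rep latencies under lat_ms["raw_times"]:

    import json
    import statistics

    # Field names here are assumptions for illustration only.
    with open("activation.jsonl") as f:
        for rec in map(json.loads, f):
            p50 = statistics.median(rec["lat_ms"]["raw_times"])
            print(rec["impl"], rec["wl"]["name"], f"{p50:.2f} ms")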
 
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
+ Installed 37 packages in 246ms
 </div>
 </div>
 <div class="cell-artifacts">
activation/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: f286130086ddc73e4e87d0a2a68de7b2f17cff9f893d7fad0e1eb7210cf7e246
  • Pointer size: 130 Bytes
  • Size of remote file: 20.7 kB

Git LFS Details

  • SHA256: 9254fad09b1905d500f91c98ba5debdf4f6497c196acc2cdc499c0572bc73647
  • Pointer size: 130 Bytes
  • Size of remote file: 20.6 kB
activation/results/combined_results.html CHANGED
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <div class="system-info">
 <div class="system-info-header">Generated on:</div>
 <div class="system-info-content">
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
 </div>
 </div>

@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:date>2025-10-24T19:26:55.354611</dc:date>
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
@@ -4021,83 +4021,83 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 (Inline matplotlib SVG internals elided: this hunk regenerates the latency chart's y-axis grid lines, tick marks, and tick labels 0.030 through 0.055, ending at the y-axis label group.)
@@ -4105,37 +4105,37 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 (Inline matplotlib SVG internals elided: this hunk replots the series paths and markers for hf_kernels_swiglu and torch_eager and redraws the legend box and its labels.)
@@ -4193,7 +4193,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
- Cell: combine | 38.46s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4267,13 +4267,13 @@ Cell: combine | 38.46s
  <div class="cell-stdout"><pre class="stdout-text">======================================================================
  LOADING BENCHMARK DATA
  ======================================================================
- ✓ HF Kernels SwiGLU : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/activation/impls/.uvnote/cache/d30443d7e6209ed0a7ffb0b020b1f31815cb2e95563283b7a25710e6420dbed8
- ✓ PyTorch SwiGLU : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/activation/impls/.uvnote/cache/86a65ffc73cc3e7a7b1efe81bde7937d3f4e55d4f6b857c3fca0d9008687d8d6

  ✓ Found HF Kernels SwiGLU
- Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/activation/impls/.uvnote/cache/d30443d7e6209ed0a7ffb0b020b1f31815cb2e95563283b7a25710e6420dbed8/activation.jsonl
  ✓ Found PyTorch SwiGLU
- Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/activation/impls/.uvnote/cache/86a65ffc73cc3e7a7b1efe81bde7937d3f4e55d4f6b857c3fca0d9008687d8d6/activation.jsonl

  ======================================================================
  Summary: 2 found, 0 skipped, 0 missing
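
For readers reproducing this step: the combine cell's job, as the log shows, amounts to reading each implementation's activation.jsonl and re-printing a per-workload summary. A minimal sketch, assuming only the record layout visible in the JSONL artifacts in this commit (`impl`, `wl.name`, `lat_ms.p50`, `ok`); the paths and the loop are illustrative stand-ins, not the tool's actual API:

```python
import json
from pathlib import Path

def load_records(path: Path) -> list[dict]:
    """Read one benchmark JSONL artifact (one JSON object per line)."""
    with path.open() as f:
        return [json.loads(line) for line in f if line.strip()]

# Illustrative stand-ins for the two cache directories printed above.
records = []
for jsonl in [Path("hf_swiglu/activation.jsonl"), Path("torch_swiglu/activation.jsonl")]:
    records.extend(load_records(jsonl))

# Re-create the summary table: impl, workload, p50 latency (ms), ok flag.
for r in records:
    p50 = r["lat_ms"]["p50"] if r.get("lat_ms") else float("nan")
    print(f"{r['impl']:<20} {r['wl']['name']:<18} {p50:>6.2f} {r['ok']}")
```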
@@ -4293,7 +4293,7 @@ hf_kernels_swiglu cuda_T512_D2048 0.03 True
  hf_kernels_swiglu cuda_T512_D768 0.03 True
  torch_eager cuda_T128_D1024 0.05 True
  torch_eager cuda_T128_D2048 0.05 True
- torch_eager cuda_T128_D768 0.05 True
  torch_eager cuda_T256_D1024 0.05 True
  torch_eager cuda_T256_D2048 0.05 True
  torch_eager cuda_T256_D768 0.05 True
@@ -4319,53 +4319,7 @@ Implementations included:
  <div class="uv-install-logs" id="uv-logs-combine">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
-  [verbose uv download log removed: kernels-benchmark-tools built from file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools; large downloads included torch (846.8MiB), nvidia-cudnn-cu12 (674.0MiB), nvidia-cublas-cu12 (566.8MiB), and nvidia-nccl-cu12 (307.4MiB)]
- Installed 37 packages in 212ms
  </div>
  </div>
  <div class="cell-artifacts">
@@ -4378,7 +4332,7 @@ Installed 37 packages in 212ms
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:date>2025-10-24T19:26:55.354611</dc:date>
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
@@ -4527,83 +4481,83 @@ Installed 37 packages in 212ms
  <g id="matplotlib.axis_2">
-  [SVG y-axis markup removed: grid lines, tick marks, and tick labels 0.030 / 0.035 / 0.040 / 0.045 / 0.050 / 0.055; the axis is rescaled in the added hunks below]
  <g id="label--y" class="ylabel">
@@ -4611,37 +4565,37 @@ Installed 37 packages in 212ms
  </g>
  </g>
  <g id="series--hf-kernels-swiglu" class="series">
-  [SVG series markup removed: old line paths and point markers for `hf_kernels_swiglu` (#1f77b4) and `torch_eager` (#ff7f0e)]
  </g>
  <g id="patch_3">
@@ -4661,25 +4615,25 @@ Installed 37 packages in 212ms
  </g>
  <g id="legend" class="legend">
-  [SVG legend markup removed: frame path, sample lines and markers, and the labels `hf_kernels_swiglu` / `torch_eager`]
  </g>
  </g>
 
  <div class="system-info">
  <div class="system-info-header">Generated on:</div>
  <div class="system-info-content">
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
  </div>
  </div>
 
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:date>2025-10-27T14:46:43.482898</dc:date>
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
 
  <g id="matplotlib.axis_2">
+  [SVG y-axis markup added: grid lines, tick marks, and tick labels 0.025 / 0.030 / 0.035 / 0.040 / 0.045 / 0.050 for the rescaled latency axis]
  <g id="label--y" class="ylabel">
 
  </g>
  </g>
  <g id="series--hf-kernels-swiglu" class="series">
+  [SVG series markup added: re-run line paths and point markers for `hf_kernels_swiglu` (#1f77b4) and `torch_eager` (#ff7f0e)]
  </g>
  <g id="patch_3">
 
  </g>
  <g id="legend" class="legend">
+  [SVG legend markup added: frame path, sample lines and markers, and the labels `hf_kernels_swiglu` / `torch_eager`, now placed at the top-right of the axes]
  </g>
  </g>
 
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
+ Cell: combine | 4.45s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
  <div class="cell-stdout"><pre class="stdout-text">======================================================================
  LOADING BENCHMARK DATA
  ======================================================================
+ ✓ HF Kernels SwiGLU : /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/2775e6386f1caf1fda935a997130c06dcaf7641efb0db21560c35301fdabfd9b
+ ✓ PyTorch SwiGLU : /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/661ca38adec8893d7c284140e922da661f0afcea4aaff6a3bf48a6494ce7c6eb

  ✓ Found HF Kernels SwiGLU
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/2775e6386f1caf1fda935a997130c06dcaf7641efb0db21560c35301fdabfd9b/activation.jsonl
  ✓ Found PyTorch SwiGLU
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/661ca38adec8893d7c284140e922da661f0afcea4aaff6a3bf48a6494ce7c6eb/activation.jsonl

  ======================================================================
  Summary: 2 found, 0 skipped, 0 missing
 
  hf_kernels_swiglu cuda_T512_D768 0.03 True
  torch_eager cuda_T128_D1024 0.05 True
  torch_eager cuda_T128_D2048 0.05 True
+ torch_eager cuda_T128_D768 0.04 True
  torch_eager cuda_T256_D1024 0.05 True
  torch_eager cuda_T256_D2048 0.05 True
  torch_eager cuda_T256_D768 0.05 True
 
  <div class="uv-install-logs" id="uv-logs-combine">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
+ Installed 37 packages in 250ms
  </div>
  </div>
  <div class="cell-artifacts">
 
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:date>2025-10-27T14:46:43.482898</dc:date>
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
 
  <g id="matplotlib.axis_2">
+  [SVG y-axis markup added: grid lines, tick marks, and tick labels 0.025 / 0.030 / 0.035 / 0.040 / 0.045 / 0.050 for the rescaled latency axis]
  <g id="label--y" class="ylabel">
 
  </g>
  </g>
  <g id="series--hf-kernels-swiglu" class="series">
+  [SVG series markup added: re-run line paths and point markers for `hf_kernels_swiglu` (#1f77b4) and `torch_eager` (#ff7f0e)]
  </g>
  <g id="patch_3">
 
  </g>
  <g id="legend" class="legend">
+  [SVG legend markup added: frame path, sample lines and markers, and the labels `hf_kernels_swiglu` / `torch_eager`, now placed at the top-right of the axes]
  </g>
  </g>
flash_attn/impls/artifacts/benchmark/attention.jsonl CHANGED
@@ -1,6 +1,6 @@
- {"ts": "2025-10-24T19:25:35Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 2.817559987306595, "p50": 2.819840970914811, "p90": 2.8203310212120414, "mean": 2.8193464037030935, "iqr": 0.002661021426320076, "raw_times": [2.8176699997857213, 2.8203310212120414, 2.821330039296299, 2.819840970914811, 2.817559987306595], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.8170199948363006, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003528594970703125, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-24T19:25:35Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 3.9076139801181853, "p50": 3.9129150100052357, "p90": 3.91379400389269, "mean": 3.920128010213375, "iqr": 0.0021209707483649254, "raw_times": [3.9546440239064395, 3.9076139801181853, 3.9129150100052357, 3.911673033144325, 3.91379400389269], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.1108770053833723, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-24T19:25:35Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 4.073257034178823, "p50": 4.119218967389315, "p90": 4.122229001950473, "mean": 4.102474392857403, "iqr": 0.04891102435067296, "raw_times": [4.073257034178823, 4.122229001950473, 4.119218967389315, 4.124348983168602, 4.0733179775998], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.606237005442381, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-24T19:25:36Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 4.078477970324457, "p50": 4.127818974666297, "p90": 4.151278990320861, "mean": 4.122894583269954, "iqr": 0.06814103107899427, "raw_times": [4.173759021796286, 4.151278990320861, 4.127818974666297, 4.078477970324457, 4.083137959241867], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.617736976593733, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-24T19:25:36Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 4.104706982616335, "p50": 4.1118780500255525, "p90": 4.146788967773318, "mean": 4.123546194750816, "iqr": 0.0404709717258811, "raw_times": [4.106317996047437, 4.104706982616335, 4.1118780500255525, 4.146788967773318, 4.148038977291435], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.064576991368085, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003528594970703125, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-24T19:25:36Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 4.358973994385451, "p50": 4.570448014419526, "p90": 4.571158031467348, "mean": 4.518645000644028, "iqr": 0.052271061576902866, "raw_times": [4.358973994385451, 4.570448014419526, 4.57375799305737, 4.571158031467348, 4.5188869698904455], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.924274002201855, "peak_bytes": 319946752, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003528594970703125, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
+ {"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
+ {"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
+ {"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
+ {"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
+ {"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
+ {"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
flash_attn/impls/cells/benchmark.py CHANGED
@@ -3,8 +3,9 @@
  # dependencies = [
  #     "numpy",
  #     "torch==2.8.0",
- #     "kernels-benchmark-tools",
  #     "kernels",
+ #     "kernels-benchmark-tools",
+ #     "sageattention",
  # ]
  #
  # [tool.uv.sources]
@@ -15,18 +16,18 @@ import sys
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
  from kernels import get_kernel

- # Load the flash attention kernel
- hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn")
+ # Load the sage attention kernel
+ hf_kernels_sage_attn = get_kernel("kernels-community/sage_attention")


- def hf_flash_attention(query, key, value):
-     """HuggingFace Kernels Flash Attention"""
-     return hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)[0]
+ def sage_attention(query, key, value):
+     """SageAttention with INT8 Q/K quantization and FP16 P/V"""
+     return hf_kernels_sage_attn.fwd(query, key, value, is_causal=False)[0]


  run_benchmark(
      kernel_type=KernelTypeEnum.ATTENTION,
-     impl_name="hf_kernels_flash_attn",
-     impl_tags={"family": "hf-kernels", "backend": "flash-attn", "compile": "none"},
-     impl_func=hf_flash_attention,
+     impl_name="sage_int8_fp16",
+     impl_tags={"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"},
+     impl_func=sage_attention,
  )
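
As the attention.jsonl records above show, this wrapper fails at call time because the loaded sage_attention module has no `fwd` attribute. A defensive variant would probe the module before handing it to the harness; the following is only a sketch under that assumption — the candidate names are guesses for illustration, not the kernel's documented API:

```python
from kernels import get_kernel

hf_kernels_sage_attn = get_kernel("kernels-community/sage_attention")

# See what the kernel actually exports instead of assuming `fwd`.
print([n for n in dir(hf_kernels_sage_attn) if not n.startswith("_")])

# Hypothetical candidate entry points, checked in order.
_CANDIDATES = ("fwd", "forward", "sageattn")
_entry = next(
    (getattr(hf_kernels_sage_attn, n) for n in _CANDIDATES
     if hasattr(hf_kernels_sage_attn, n)),
    None,
)
if _entry is None:
    raise AttributeError(f"sage_attention kernel exposes none of {_CANDIDATES}")

def sage_attention(query, key, value):
    """SageAttention wrapper that fails fast at load time, not per-workload."""
    return _entry(query, key, value, is_causal=False)[0]
```

Failing at load time would surface one clear error instead of six identical per-workload failures in the JSONL.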
flash_attn/impls/flash_attention.html CHANGED
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <div class="system-info">
  <div class="system-info-header">Generated on:</div>
  <div class="system-info-content">
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
  </div>
  </div>
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
  </span> |
- Cell: nv | 4.05s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3888,34 +3888,22 @@ Cell: nv | 4.05s
  </div>
  </div>
  <div id="output-nv" class="cell-output">
- <div class="cell-stdout"><pre class="stdout-text">Fri Oct 24 19:21:04 2025
  +-----------------------------------------------------------------------------------------+
- | NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
  |-----------------------------------------+------------------------+----------------------+
  | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
  | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
  |                                         |                        |               MIG M. |
  |=========================================+========================+======================|
- |   0  NVIDIA L4                      Off |   00000000:38:00.0 Off |                    0 |
- | N/A   36C    P0             27W /   72W |       1MiB /  23034MiB |      0%      Default |
- |                                         |                        |                  N/A |
- +-----------------------------------------+------------------------+----------------------+
- |   1  NVIDIA L4                      Off |   00000000:3A:00.0 Off |                    0 |
- | N/A   33C    P0             28W /   72W |       1MiB /  23034MiB |      2%      Default |
- |                                         |                        |                  N/A |
- +-----------------------------------------+------------------------+----------------------+
- |   2  NVIDIA L4                      Off |   00000000:3C:00.0 Off |                    0 |
- | N/A   34C    P0             27W /   72W |       1MiB /  23034MiB |      1%      Default |
- |                                         |                        |                  N/A |
- +-----------------------------------------+------------------------+----------------------+
- |   3  NVIDIA L4                      Off |   00000000:3E:00.0 Off |                    0 |
- | N/A   33C    P0             27W /   72W |       1MiB /  23034MiB |      2%      Default |
  |                                         |                        |                  N/A |
  +-----------------------------------------+------------------------+----------------------+

  +-----------------------------------------------------------------------------------------+
  | Processes:                                                                              |
- |  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
  |        ID   ID                                                               Usage      |
  |=========================================================================================|
  |  No running processes found                                                             |
3931
  <span class="collapse-indicators">
3932
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3933
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3934
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3935
  </span> |
3936
- Cell: benchmark | 44.13s
3937
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3938
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3939
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3984,29 +3972,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
3984
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3985
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
- torch_flash_ma 2.87% 353.236us 20.60% 2.536ms 2.536ms 0.000us 0.00% 10.773ms 10.773ms 1
3988
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 10.620ms 100.09% 10.620ms 10.620ms 1
3989
- aten::scaled_dot_product_attention 0.36% 44.342us 1.92% 236.065us 78.688us 0.000us 0.00% 8.386ms 2.795ms 3
3990
- aten::_scaled_dot_product_flash_attention 0.24% 29.551us 1.56% 191.723us 63.908us 0.000us 0.00% 8.386ms 2.795ms 3
3991
- aten::_flash_attention_forward 0.31% 38.342us 1.10% 135.583us 45.194us 8.386ms 79.03% 8.386ms 2.795ms 3
3992
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 8.386ms 79.03% 8.386ms 2.795ms 3
3993
- aten::contiguous 0.12% 15.199us 15.18% 1.869ms 155.744us 0.000us 0.00% 2.387ms 198.924us 12
3994
- aten::clone 0.36% 44.321us 15.06% 1.854ms 154.478us 0.000us 0.00% 2.387ms 198.924us 12
3995
- aten::copy_ 0.78% 95.990us 13.98% 1.720ms 143.361us 2.225ms 20.97% 2.387ms 198.924us 12
3996
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.225ms 20.97% 2.225ms 185.396us 12
3997
- Activity Buffer Request 12.35% 1.520ms 12.35% 1.520ms 1.520ms 162.335us 1.53% 162.335us 162.335us 1
3998
- aten::transpose 0.62% 76.778us 0.84% 103.972us 4.332us 0.000us 0.00% 0.000us 0.000us 24
3999
- aten::as_strided 0.22% 27.194us 0.22% 27.194us 1.133us 0.000us 0.00% 0.000us 0.000us 24
4000
- aten::empty_like 0.24% 30.024us 0.91% 112.425us 7.495us 0.000us 0.00% 0.000us 0.000us 15
4001
- aten::empty 0.80% 98.881us 0.80% 98.881us 4.120us 0.000us 0.00% 0.000us 0.000us 24
4002
- cudaLaunchKernel 1.06% 129.984us 1.06% 129.984us 8.666us 0.000us 0.00% 0.000us 0.000us 15
4003
- aten::empty_strided 0.14% 17.180us 0.14% 17.180us 5.727us 0.000us 0.00% 0.000us 0.000us 3
4004
- cudaDeviceGetAttribute 0.02% 2.899us 0.02% 2.899us 0.483us 0.000us 0.00% 0.000us 0.000us 6
4005
- cudaFuncSetAttribute 0.10% 11.980us 0.10% 11.980us 3.993us 0.000us 0.00% 0.000us 0.000us 3
4006
- cudaDeviceSynchronize 79.40% 9.774ms 79.40% 9.774ms 9.774ms 0.000us 0.00% 0.000us 0.000us 1
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
- Self CPU time total: 12.309ms
4009
- Self CUDA time total: 10.610ms
4010
 
4011
 
4012
 
@@ -4016,29 +4004,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
- torch_flash_ma 1.72% 263.576us 14.84% 2.279ms 2.279ms 0.000us 0.00% 13.971ms 13.971ms 1
4020
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 13.784ms 100.09% 13.784ms 13.784ms 1
4021
- aten::scaled_dot_product_attention 0.17% 25.751us 1.16% 178.074us 59.358us 0.000us 0.00% 11.389ms 3.796ms 3
4022
- aten::_scaled_dot_product_flash_attention 0.12% 18.370us 0.99% 152.323us 50.774us 0.000us 0.00% 11.389ms 3.796ms 3
4023
- aten::_flash_attention_forward 0.21% 32.869us 0.72% 109.873us 36.624us 11.389ms 82.70% 11.389ms 3.796ms 3
4024
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 11.389ms 82.70% 11.389ms 3.796ms 3
4025
- aten::contiguous 0.06% 9.710us 11.64% 1.787ms 148.932us 0.000us 0.00% 2.582ms 215.169us 12
4026
- aten::clone 0.19% 29.062us 11.57% 1.777ms 148.123us 0.000us 0.00% 2.582ms 215.169us 12
4027
- aten::copy_ 0.55% 83.901us 10.97% 1.685ms 140.395us 2.382ms 17.30% 2.582ms 215.169us 12
4028
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.382ms 17.30% 2.382ms 198.534us 12
4029
- Activity Buffer Request 9.88% 1.517ms 9.88% 1.517ms 1.517ms 199.614us 1.45% 199.614us 199.614us 1
4030
- aten::transpose 0.36% 54.739us 0.48% 74.091us 3.087us 0.000us 0.00% 0.000us 0.000us 24
4031
- aten::as_strided 0.13% 19.352us 0.13% 19.352us 0.806us 0.000us 0.00% 0.000us 0.000us 24
4032
- aten::empty_like 0.13% 19.810us 0.54% 82.371us 5.491us 0.000us 0.00% 0.000us 0.000us 15
4033
- aten::empty 0.51% 77.821us 0.51% 77.821us 3.243us 0.000us 0.00% 0.000us 0.000us 24
4034
- cudaLaunchKernel 0.70% 107.293us 0.70% 107.293us 7.153us 0.000us 0.00% 0.000us 0.000us 15
4035
- aten::empty_strided 0.09% 13.681us 0.09% 13.681us 4.560us 0.000us 0.00% 0.000us 0.000us 3
4036
- cudaDeviceGetAttribute 0.01% 1.961us 0.01% 1.961us 0.327us 0.000us 0.00% 0.000us 0.000us 6
4037
- cudaFuncSetAttribute 0.03% 4.001us 0.03% 4.001us 1.334us 0.000us 0.00% 0.000us 0.000us 3
4038
- cudaDeviceSynchronize 85.16% 13.081ms 85.16% 13.081ms 13.081ms 0.000us 0.00% 0.000us 0.000us 1
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
- Self CPU time total: 15.360ms
4041
- Self CUDA time total: 13.772ms
4042
 
4043
 
4044
 
@@ -4048,29 +4036,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4050
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4051
- torch_flash_ma 1.59% 253.009us 16.33% 2.606ms 2.606ms 0.000us 0.00% 14.231ms 14.231ms 1
4052
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 14.040ms 100.09% 14.040ms 14.040ms 1
4053
- aten::scaled_dot_product_attention 0.16% 26.200us 1.12% 178.593us 59.531us 0.000us 0.00% 11.609ms 3.870ms 3
4054
- aten::_scaled_dot_product_flash_attention 0.12% 19.071us 0.96% 152.393us 50.798us 0.000us 0.00% 11.609ms 3.870ms 3
4055
- aten::_flash_attention_forward 0.21% 33.032us 0.69% 110.322us 36.774us 11.609ms 82.76% 11.609ms 3.870ms 3
4056
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 11.609ms 82.76% 11.609ms 3.870ms 3
4057
- aten::contiguous 0.06% 10.030us 13.32% 2.125ms 177.070us 0.000us 0.00% 2.623ms 218.547us 12
4058
- aten::clone 0.18% 28.858us 13.25% 2.115ms 176.235us 0.000us 0.00% 2.623ms 218.547us 12
4059
- aten::copy_ 0.51% 81.604us 12.67% 2.022ms 168.500us 2.418ms 17.24% 2.623ms 218.547us 12
4060
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.418ms 17.24% 2.418ms 201.529us 12
4061
- Activity Buffer Request 11.62% 1.854ms 11.62% 1.854ms 1.854ms 204.222us 1.46% 204.222us 204.222us 1
4062
- aten::transpose 0.33% 52.790us 0.45% 72.350us 3.015us 0.000us 0.00% 0.000us 0.000us 24
4063
- aten::as_strided 0.12% 19.560us 0.12% 19.560us 0.815us 0.000us 0.00% 0.000us 0.000us 24
4064
- aten::empty_like 0.12% 19.891us 0.52% 83.030us 5.535us 0.000us 0.00% 0.000us 0.000us 15
4065
- aten::empty 0.49% 77.888us 0.49% 77.888us 3.245us 0.000us 0.00% 0.000us 0.000us 24
4066
- cudaLaunchKernel 0.69% 109.402us 0.69% 109.402us 7.293us 0.000us 0.00% 0.000us 0.000us 15
4067
- aten::empty_strided 0.09% 14.430us 0.09% 14.430us 4.810us 0.000us 0.00% 0.000us 0.000us 3
4068
- cudaDeviceGetAttribute 0.01% 1.730us 0.01% 1.730us 0.288us 0.000us 0.00% 0.000us 0.000us 6
4069
- cudaFuncSetAttribute 0.02% 3.831us 0.02% 3.831us 1.277us 0.000us 0.00% 0.000us 0.000us 3
4070
- cudaDeviceSynchronize 83.67% 13.349ms 83.67% 13.349ms 13.349ms 0.000us 0.00% 0.000us 0.000us 1
4071
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4072
- Self CPU time total: 15.955ms
4073
- Self CUDA time total: 14.027ms
4074
 
4075
 
4076
 
@@ -4080,29 +4068,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
4080
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4081
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4082
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4083
- torch_flash_ma 1.54% 253.696us 15.59% 2.567ms 2.567ms 0.000us 0.00% 14.787ms 14.787ms 1
4084
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 14.594ms 100.09% 14.594ms 14.594ms 1
4085
- aten::scaled_dot_product_attention 0.16% 26.450us 1.08% 178.164us 59.388us 0.000us 0.00% 12.117ms 4.039ms 3
4086
- aten::_scaled_dot_product_flash_attention 0.12% 18.962us 0.92% 151.714us 50.571us 0.000us 0.00% 12.117ms 4.039ms 3
4087
- aten::_flash_attention_forward 0.20% 32.440us 0.66% 109.033us 36.344us 12.117ms 83.10% 12.117ms 4.039ms 3
4088
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 12.117ms 83.10% 12.117ms 4.039ms 3
4089
- aten::contiguous 0.06% 10.538us 12.68% 2.087ms 173.951us 0.000us 0.00% 2.670ms 222.462us 12
4090
- aten::clone 0.17% 28.412us 12.61% 2.077ms 173.073us 0.000us 0.00% 2.670ms 222.462us 12
4091
- aten::copy_ 0.50% 82.093us 12.05% 1.984ms 165.351us 2.464ms 16.90% 2.670ms 222.462us 12
4092
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.464ms 16.90% 2.464ms 205.326us 12
4093
- Activity Buffer Request 9.45% 1.555ms 9.45% 1.555ms 1.555ms 205.630us 1.41% 205.630us 205.630us 1
4094
- aten::transpose 0.32% 52.269us 0.44% 71.730us 2.989us 0.000us 0.00% 0.000us 0.000us 24
4095
- aten::as_strided 0.12% 19.461us 0.12% 19.461us 0.811us 0.000us 0.00% 0.000us 0.000us 24
4096
- aten::empty_like 0.12% 19.690us 0.51% 84.151us 5.610us 0.000us 0.00% 0.000us 0.000us 15
4097
- aten::empty 0.47% 77.802us 0.47% 77.802us 3.242us 0.000us 0.00% 0.000us 0.000us 24
4098
- cudaLaunchKernel 2.24% 369.337us 2.24% 369.337us 24.622us 0.000us 0.00% 0.000us 0.000us 15
4099
- aten::empty_strided 0.09% 14.871us 0.09% 14.871us 4.957us 0.000us 0.00% 0.000us 0.000us 3
4100
- cudaDeviceGetAttribute 0.01% 1.880us 0.01% 1.880us 0.313us 0.000us 0.00% 0.000us 0.000us 6
4101
- cudaFuncSetAttribute 0.02% 4.010us 0.02% 4.010us 1.337us 0.000us 0.00% 0.000us 0.000us 3
4102
- cudaDeviceSynchronize 84.41% 13.899ms 84.41% 13.899ms 13.899ms 0.000us 0.00% 0.000us 0.000us 1
4103
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4104
- Self CPU time total: 16.466ms
4105
- Self CUDA time total: 14.581ms
4106
 
4107
 
4108
 
@@ -4112,29 +4100,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
4112
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4113
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4114
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4115
- torch_flash_ma 1.70% 278.864us 15.50% 2.543ms 2.543ms 0.000us 0.00% 14.797ms 14.797ms 1
4116
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 14.600ms 100.09% 14.600ms 14.600ms 1
4117
- aten::scaled_dot_product_attention 0.17% 27.381us 1.16% 189.724us 63.241us 0.000us 0.00% 12.088ms 4.029ms 3
4118
- aten::_scaled_dot_product_flash_attention 0.12% 19.359us 0.99% 162.343us 54.114us 0.000us 0.00% 12.088ms 4.029ms 3
4119
- aten::_flash_attention_forward 0.21% 33.700us 0.72% 118.223us 39.408us 12.088ms 82.87% 12.088ms 4.029ms 3
4120
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 12.088ms 82.87% 12.088ms 4.029ms 3
4121
- aten::contiguous 0.06% 10.278us 12.35% 2.025ms 168.720us 0.000us 0.00% 2.709ms 225.729us 12
4122
- aten::clone 0.18% 29.935us 12.28% 2.014ms 167.864us 0.000us 0.00% 2.709ms 225.729us 12
4123
- aten::copy_ 0.52% 84.857us 11.68% 1.915ms 159.605us 2.499ms 17.13% 2.709ms 225.729us 12
4124
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.499ms 17.13% 2.499ms 208.262us 12
4125
- Activity Buffer Request 9.10% 1.493ms 9.10% 1.493ms 1.493ms 209.598us 1.44% 209.598us 209.598us 1
4126
- aten::transpose 0.33% 54.376us 0.45% 74.216us 3.092us 0.000us 0.00% 0.000us 0.000us 24
4127
- aten::as_strided 0.12% 19.840us 0.12% 19.840us 0.827us 0.000us 0.00% 0.000us 0.000us 24
4128
- aten::empty_like 0.12% 20.251us 0.54% 88.821us 5.921us 0.000us 0.00% 0.000us 0.000us 15
4129
- aten::empty 0.50% 82.172us 0.50% 82.172us 3.424us 0.000us 0.00% 0.000us 0.000us 24
4130
- cudaLaunchKernel 2.25% 368.209us 2.25% 368.209us 24.547us 0.000us 0.00% 0.000us 0.000us 15
4131
- aten::empty_strided 0.09% 14.850us 0.09% 14.850us 4.950us 0.000us 0.00% 0.000us 0.000us 3
4132
- cudaDeviceGetAttribute 0.01% 2.110us 0.01% 2.110us 0.352us 0.000us 0.00% 0.000us 0.000us 6
4133
- cudaFuncSetAttribute 0.02% 3.861us 0.02% 3.861us 1.287us 0.000us 0.00% 0.000us 0.000us 3
4134
- cudaDeviceSynchronize 84.50% 13.857ms 84.50% 13.857ms 13.857ms 0.000us 0.00% 0.000us 0.000us 1
4135
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4136
- Self CPU time total: 16.399ms
4137
- Self CUDA time total: 14.587ms
4138
 
4139
 
4140
 
@@ -4144,91 +4132,39 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
4144
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4145
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4146
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4147
- torch_flash_ma 1.34% 250.556us 18.55% 3.457ms 3.457ms 0.000us 0.00% 16.094ms 16.094ms 1
4148
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 15.878ms 100.09% 15.878ms 15.878ms 1
4149
- aten::scaled_dot_product_attention 0.14% 25.201us 0.97% 180.244us 60.081us 0.000us 0.00% 12.955ms 4.318ms 3
4150
- aten::_scaled_dot_product_flash_attention 0.10% 18.431us 0.83% 155.043us 51.681us 0.000us 0.00% 12.955ms 4.318ms 3
4151
- aten::_flash_attention_forward 0.18% 33.193us 0.61% 113.432us 37.811us 12.955ms 81.66% 12.955ms 4.318ms 3
4152
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 12.955ms 81.66% 12.955ms 4.318ms 3
4153
- aten::contiguous 0.05% 10.100us 15.97% 2.976ms 248.003us 0.000us 0.00% 3.139ms 261.603us 12
4154
- aten::clone 0.16% 29.450us 15.92% 2.966ms 247.161us 0.000us 0.00% 3.139ms 261.603us 12
4155
- aten::copy_ 0.46% 85.134us 15.41% 2.871ms 239.275us 2.909ms 18.34% 3.139ms 261.603us 12
4156
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.909ms 18.34% 2.909ms 242.440us 12
4157
- Activity Buffer Request 8.03% 1.497ms 8.03% 1.497ms 1.497ms 229.949us 1.45% 229.949us 229.949us 1
4158
- aten::transpose 0.29% 53.550us 0.39% 73.110us 3.046us 0.000us 0.00% 0.000us 0.000us 24
4159
- aten::as_strided 0.10% 19.560us 0.10% 19.560us 0.815us 0.000us 0.00% 0.000us 0.000us 24
4160
- aten::empty_like 0.11% 19.791us 0.47% 87.501us 5.833us 0.000us 0.00% 0.000us 0.000us 15
4161
- aten::empty 0.42% 78.571us 0.42% 78.571us 3.274us 0.000us 0.00% 0.000us 0.000us 24
4162
- cudaLaunchKernel 7.05% 1.313ms 7.05% 1.313ms 87.561us 0.000us 0.00% 0.000us 0.000us 15
4163
- aten::empty_strided 0.09% 17.450us 0.09% 17.450us 5.817us 0.000us 0.00% 0.000us 0.000us 3
4164
- cudaDeviceGetAttribute 0.01% 1.828us 0.01% 1.828us 0.305us 0.000us 0.00% 0.000us 0.000us 6
4165
- cudaFuncSetAttribute 0.02% 3.779us 0.02% 3.779us 1.260us 0.000us 0.00% 0.000us 0.000us 3
4166
- cudaDeviceSynchronize 81.45% 15.178ms 81.45% 15.178ms 15.178ms 0.000us 0.00% 0.000us 0.000us 1
4167
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4168
- Self CPU time total: 18.634ms
4169
- Self CUDA time total: 15.864ms
4170
 
4171
 
4172
  impl wl p50(ms) ok
4173
- torch_flash_ma cuda_attn_L128_bfloat16 4.09 True
4174
- torch_flash_ma cuda_attn_L256_bfloat16 4.79 True
4175
- torch_flash_ma cuda_attn_L320_bfloat16 4.90 True
4176
- torch_flash_ma cuda_attn_L384_bfloat16 4.98 True
4177
- torch_flash_ma cuda_attn_L448_bfloat16 5.05 True
4178
- torch_flash_ma cuda_attn_L512_bfloat16 5.47 True
4179
  </pre></div>
4180
- <div class="uv-install-logs" id="uv-logs-benchmark">
4181
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4182
- <div class="uv-logs-content" style="display: none;">
4183
- Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
4184
- Downloading nvidia-curand-cu12 (60.7MiB)
4185
- Downloading numpy (15.9MiB)
4186
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4187
- Downloading pillow (6.7MiB)
4188
- Downloading fonttools (4.7MiB)
4189
- Downloading networkx (1.9MiB)
4190
- Downloading setuptools (1.1MiB)
4191
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4192
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4193
- Downloading nvidia-nccl-cu12 (307.4MiB)
4194
- Downloading nvidia-cufft-cu12 (184.2MiB)
4195
- Downloading nvidia-cublas-cu12 (566.8MiB)
4196
- Downloading nvidia-cufile-cu12 (1.1MiB)
4197
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4198
- Downloading torch (846.8MiB)
4199
- Downloading kiwisolver (1.4MiB)
4200
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4201
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4202
- Downloading matplotlib (8.3MiB)
4203
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4204
- Downloading sympy (6.0MiB)
4205
- Downloading triton (148.4MiB)
4206
- Downloading nvidia-cufile-cu12
4207
- Downloading kiwisolver
4208
- Downloading setuptools
4209
- Downloading networkx
4210
- Downloading fonttools
4211
- Downloading pillow
4212
- Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
4213
- Downloading matplotlib
4214
- Downloading nvidia-cuda-cupti-cu12
4215
- Downloading numpy
4216
- Downloading sympy
4217
- Downloading nvidia-nvjitlink-cu12
4218
- Downloading nvidia-curand-cu12
4219
- Downloading nvidia-cuda-nvrtc-cu12
4220
- Downloading triton
4221
- Downloading nvidia-cufft-cu12
4222
- Downloading nvidia-cusolver-cu12
4223
- Downloading nvidia-cusparse-cu12
4224
- Downloading nvidia-cusparselt-cu12
4225
- Downloading nvidia-nccl-cu12
4226
- Downloading nvidia-cublas-cu12
4227
- Downloading nvidia-cudnn-cu12
4228
- Downloading torch
4229
- Installed 37 packages in 231ms
4230
- </div>
4231
- </div>
4232
  <div class="cell-artifacts">
4233
  <h4>Artifacts:</h4>
4234
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
 
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: nv | 0.26s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3888
  </div>
3889
  </div>
3890
  <div id="output-nv" class="cell-output">
3891
+ <div class="cell-stdout"><pre class="stdout-text">Mon Oct 27 14:45:45 2025
3892
  +-----------------------------------------------------------------------------------------+
3893
+ | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3894
  |-----------------------------------------+------------------------+----------------------+
3895
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3896
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3897
  | | | MIG M. |
3898
  |=========================================+========================+======================|
3899
+ | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3900
+ | N/A 31C P0 135W / 350W | 0MiB / 46068MiB | 100% Default |
3901
  | | | N/A |
3902
  +-----------------------------------------+------------------------+----------------------+
3903
 
3904
  +-----------------------------------------------------------------------------------------+
3905
  | Processes: |
3906
+ | GPU GI CI PID Type Process name GPU Memory |
3907
  | ID ID Usage |
3908
  |=========================================================================================|
3909
  | No running processes found |
 
3919
  <span class="collapse-indicators">
3920
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3921
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3922
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3923
  </span> |
3924
+ Cell: benchmark | 3.87s
3925
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3926
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3927
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.610ms 101.76% 3.610ms 3.610ms 1
3976
+ torch_flash_ma 6.54% 340.396us 46.01% 2.394ms 2.394ms 0.000us 0.00% 3.588ms 3.588ms 1
3977
+ aten::scaled_dot_product_attention 0.84% 43.810us 4.24% 220.593us 73.531us 0.000us 0.00% 2.829ms 943.091us 3
3978
+ aten::_scaled_dot_product_flash_attention 0.51% 26.609us 3.40% 176.783us 58.928us 0.000us 0.00% 2.829ms 943.091us 3
3979
+ aten::_flash_attention_forward 0.74% 38.381us 2.45% 127.692us 42.564us 2.829ms 79.74% 2.829ms 943.091us 3
3980
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.829ms 79.74% 2.829ms 943.091us 3
3981
+ aten::contiguous 0.29% 15.001us 33.86% 1.762ms 146.802us 0.000us 0.00% 759.072us 63.256us 12
3982
+ aten::clone 0.76% 39.432us 33.57% 1.747ms 145.552us 0.000us 0.00% 759.072us 63.256us 12
3983
+ aten::copy_ 1.71% 88.801us 31.26% 1.626ms 135.534us 718.688us 20.26% 759.072us 63.256us 12
3984
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 718.688us 20.26% 718.688us 59.891us 12
3985
+ Activity Buffer Request 27.68% 1.440ms 27.68% 1.440ms 1.440ms 40.384us 1.14% 40.384us 40.384us 1
3986
+ aten::transpose 1.34% 69.973us 1.80% 93.503us 3.896us 0.000us 0.00% 0.000us 0.000us 24
3987
+ aten::as_strided 0.45% 23.530us 0.45% 23.530us 0.980us 0.000us 0.00% 0.000us 0.000us 24
3988
+ aten::empty_like 0.50% 25.908us 1.97% 102.319us 6.821us 0.000us 0.00% 0.000us 0.000us 15
3989
+ aten::empty 1.75% 91.041us 1.75% 91.041us 3.793us 0.000us 0.00% 0.000us 0.000us 24
3990
+ cudaLaunchKernel 2.36% 123.031us 2.36% 123.031us 8.202us 0.000us 0.00% 0.000us 0.000us 15
3991
+ aten::empty_strided 0.31% 16.010us 0.31% 16.010us 5.337us 0.000us 0.00% 0.000us 0.000us 3
3992
+ cudaDeviceGetAttribute 0.05% 2.700us 0.05% 2.700us 0.450us 0.000us 0.00% 0.000us 0.000us 6
3993
+ cudaFuncSetAttribute 0.17% 8.980us 0.17% 8.980us 2.993us 0.000us 0.00% 0.000us 0.000us 3
3994
+ cudaDeviceSynchronize 53.99% 2.809ms 53.99% 2.809ms 2.809ms 0.000us 0.00% 0.000us 0.000us 1
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
+ Self CPU time total: 5.203ms
3997
+ Self CUDA time total: 3.548ms
3998
 
3999
 
4000
 
 
4004
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
+ torch_flash_ma 5.17% 272.917us 42.06% 2.218ms 2.218ms 0.000us 0.00% 3.821ms 3.821ms 1
4008
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.777ms 100.28% 3.777ms 3.777ms 1
4009
+ aten::scaled_dot_product_attention 0.53% 27.761us 3.55% 187.333us 62.444us 0.000us 0.00% 3.004ms 1.001ms 3
4010
+ aten::_scaled_dot_product_flash_attention 0.37% 19.492us 3.03% 159.572us 53.191us 0.000us 0.00% 3.004ms 1.001ms 3
4011
+ aten::_flash_attention_forward 0.75% 39.549us 2.23% 117.371us 39.124us 3.004ms 79.75% 3.004ms 1.001ms 3
4012
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.004ms 79.75% 3.004ms 1.001ms 3
4013
+ aten::contiguous 0.20% 10.320us 32.06% 1.691ms 140.876us 0.000us 0.00% 817.314us 68.110us 12
4014
+ aten::clone 0.55% 29.048us 31.86% 1.680ms 140.016us 0.000us 0.00% 817.314us 68.110us 12
4015
+ aten::copy_ 1.64% 86.662us 30.11% 1.588ms 132.347us 762.658us 20.25% 817.314us 68.110us 12
4016
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 762.658us 20.25% 762.658us 63.555us 12
4017
+ Activity Buffer Request 26.84% 1.415ms 26.84% 1.415ms 1.415ms 54.656us 1.45% 54.656us 54.656us 1
4018
+ aten::transpose 1.36% 71.528us 1.71% 90.179us 3.757us 0.000us 0.00% 0.000us 0.000us 24
4019
+ aten::as_strided 0.35% 18.651us 0.35% 18.651us 0.777us 0.000us 0.00% 0.000us 0.000us 24
4020
+ aten::empty_like 0.38% 19.801us 1.55% 81.840us 5.456us 0.000us 0.00% 0.000us 0.000us 15
4021
+ aten::empty 1.46% 77.040us 1.46% 77.040us 3.210us 0.000us 0.00% 0.000us 0.000us 24
4022
+ cudaLaunchKernel 2.07% 108.973us 2.07% 108.973us 7.265us 0.000us 0.00% 0.000us 0.000us 15
4023
+ aten::empty_strided 0.26% 13.940us 0.26% 13.940us 4.647us 0.000us 0.00% 0.000us 0.000us 3
4024
+ cudaDeviceGetAttribute 0.06% 2.910us 0.06% 2.910us 0.485us 0.000us 0.00% 0.000us 0.000us 6
4025
+ cudaFuncSetAttribute 0.08% 4.240us 0.08% 4.240us 1.413us 0.000us 0.00% 0.000us 0.000us 3
4026
+ cudaDeviceSynchronize 57.94% 3.056ms 57.94% 3.056ms 3.056ms 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
+ Self CPU time total: 5.274ms
4029
+ Self CUDA time total: 3.767ms
4030
 
4031
 
4032
 
 
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
+ torch_flash_ma 4.99% 269.576us 41.89% 2.262ms 2.262ms 0.000us 0.00% 3.875ms 3.875ms 1
4040
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.827ms 100.29% 3.827ms 3.827ms 1
4041
+ aten::scaled_dot_product_attention 0.50% 27.011us 3.47% 187.262us 62.421us 0.000us 0.00% 3.037ms 1.012ms 3
4042
+ aten::_scaled_dot_product_flash_attention 0.35% 18.851us 2.97% 160.251us 53.417us 0.000us 0.00% 3.037ms 1.012ms 3
4043
+ aten::_flash_attention_forward 0.72% 39.000us 2.20% 118.550us 39.517us 3.037ms 79.57% 3.037ms 1.012ms 3
4044
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.037ms 79.57% 3.037ms 1.012ms 3
4045
+ aten::contiguous 0.18% 9.780us 32.51% 1.755ms 146.253us 0.000us 0.00% 838.461us 69.872us 12
4046
+ aten::clone 0.54% 29.119us 32.32% 1.745ms 145.438us 0.000us 0.00% 838.461us 69.872us 12
4047
+ aten::copy_ 1.56% 84.200us 30.52% 1.648ms 137.328us 779.741us 20.43% 838.461us 69.872us 12
4048
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 779.741us 20.43% 779.741us 64.978us 12
4049
+ Activity Buffer Request 27.41% 1.480ms 27.41% 1.480ms 1.480ms 58.720us 1.54% 58.720us 58.720us 1
4050
+ aten::transpose 1.00% 54.180us 1.34% 72.500us 3.021us 0.000us 0.00% 0.000us 0.000us 24
4051
+ aten::as_strided 0.34% 18.320us 0.34% 18.320us 0.763us 0.000us 0.00% 0.000us 0.000us 24
4052
+ aten::empty_like 0.36% 19.560us 1.66% 89.381us 5.959us 0.000us 0.00% 0.000us 0.000us 15
4053
+ aten::empty 1.53% 82.821us 1.53% 82.821us 3.451us 0.000us 0.00% 0.000us 0.000us 24
4054
+ cudaLaunchKernel 1.99% 107.272us 1.99% 107.272us 7.151us 0.000us 0.00% 0.000us 0.000us 15
4055
+ aten::empty_strided 0.30% 16.380us 0.30% 16.380us 5.460us 0.000us 0.00% 0.000us 0.000us 3
4056
+ cudaDeviceGetAttribute 0.03% 1.850us 0.03% 1.850us 0.308us 0.000us 0.00% 0.000us 0.000us 6
4057
+ cudaFuncSetAttribute 0.07% 3.830us 0.07% 3.830us 1.277us 0.000us 0.00% 0.000us 0.000us 3
4058
+ cudaDeviceSynchronize 58.11% 3.138ms 58.11% 3.138ms 3.138ms 0.000us 0.00% 0.000us 0.000us 1
4059
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
+ Self CPU time total: 5.399ms
4061
+ Self CUDA time total: 3.817ms
4062
 
4063
 
4064
 
 
4068
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4069
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
+ torch_flash_ma 4.76% 268.853us 43.13% 2.435ms 2.435ms 0.000us 0.00% 3.964ms 3.964ms 1
4072
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.917ms 100.30% 3.917ms 3.917ms 1
4073
+ aten::scaled_dot_product_attention 0.49% 27.720us 3.46% 195.333us 65.111us 0.000us 0.00% 3.118ms 1.039ms 3
4074
+ aten::_scaled_dot_product_flash_attention 0.34% 19.471us 2.97% 167.613us 55.871us 0.000us 0.00% 3.118ms 1.039ms 3
4075
+ aten::_flash_attention_forward 0.70% 39.530us 2.23% 125.742us 41.914us 3.118ms 79.84% 3.118ms 1.039ms 3
4076
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.118ms 79.84% 3.118ms 1.039ms 3
4077
+ aten::contiguous 0.17% 9.719us 34.03% 1.921ms 160.116us 0.000us 0.00% 845.599us 70.467us 12
4078
+ aten::clone 0.52% 29.239us 33.85% 1.912ms 159.306us 0.000us 0.00% 845.599us 70.467us 12
4079
+ aten::copy_ 1.54% 86.910us 32.19% 1.818ms 151.460us 787.167us 20.16% 845.599us 70.467us 12
4080
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 787.167us 20.16% 787.167us 65.597us 12
4081
+ Activity Buffer Request 25.41% 1.435ms 25.41% 1.435ms 1.435ms 58.432us 1.50% 58.432us 58.432us 1
4082
+ aten::transpose 0.96% 54.080us 1.28% 72.141us 3.006us 0.000us 0.00% 0.000us 0.000us 24
4083
+ aten::as_strided 0.32% 18.061us 0.32% 18.061us 0.753us 0.000us 0.00% 0.000us 0.000us 24
4084
+ aten::empty_like 0.35% 19.512us 1.49% 84.134us 5.609us 0.000us 0.00% 0.000us 0.000us 15
4085
+ aten::empty 1.53% 86.581us 1.53% 86.581us 3.608us 0.000us 0.00% 0.000us 0.000us 24
4086
+ cudaLaunchKernel 5.66% 319.547us 5.66% 319.547us 21.303us 0.000us 0.00% 0.000us 0.000us 15
4087
+ aten::empty_strided 0.26% 14.430us 0.26% 14.430us 4.810us 0.000us 0.00% 0.000us 0.000us 3
4088
+ cudaDeviceGetAttribute 0.05% 2.740us 0.05% 2.740us 0.457us 0.000us 0.00% 0.000us 0.000us 6
4089
+ cudaFuncSetAttribute 0.07% 4.201us 0.07% 4.201us 1.400us 0.000us 0.00% 0.000us 0.000us 3
4090
+ cudaDeviceSynchronize 56.87% 3.211ms 56.87% 3.211ms 3.211ms 0.000us 0.00% 0.000us 0.000us 1
4091
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4092
+ Self CPU time total: 5.647ms
4093
+ Self CUDA time total: 3.906ms
4094
 
4095
 
4096
 
 
4100
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4101
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
+ torch_flash_ma 5.25% 320.614us 40.80% 2.490ms 2.490ms 0.000us 0.00% 4.428ms 4.428ms 1
4104
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.377ms 100.25% 4.377ms 4.377ms 1
4105
+ aten::scaled_dot_product_attention 0.44% 26.800us 3.27% 199.713us 66.571us 0.000us 0.00% 3.558ms 1.186ms 3
4106
+ aten::_scaled_dot_product_flash_attention 0.32% 19.239us 2.83% 172.913us 57.638us 0.000us 0.00% 3.558ms 1.186ms 3
4107
+ aten::_flash_attention_forward 0.64% 38.816us 2.13% 129.963us 43.321us 3.558ms 81.48% 3.558ms 1.186ms 3
4108
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.558ms 81.48% 3.558ms 1.186ms 3
4109
+ aten::contiguous 0.17% 10.568us 31.48% 1.922ms 160.138us 0.000us 0.00% 870.015us 72.501us 12
4110
+ aten::clone 0.48% 29.552us 31.31% 1.911ms 159.257us 0.000us 0.00% 870.015us 72.501us 12
4111
+ aten::copy_ 1.37% 83.622us 29.71% 1.813ms 151.123us 808.479us 18.52% 870.015us 72.501us 12
4112
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 808.479us 18.52% 808.479us 67.373us 12
4113
+ Activity Buffer Request 24.07% 1.469ms 24.07% 1.469ms 1.469ms 61.536us 1.41% 61.536us 61.536us 1
4114
+ aten::transpose 0.88% 53.494us 1.18% 71.893us 2.996us 0.000us 0.00% 0.000us 0.000us 24
4115
+ aten::as_strided 0.30% 18.399us 0.30% 18.399us 0.767us 0.000us 0.00% 0.000us 0.000us 24
4116
+ aten::empty_like 0.45% 27.388us 1.61% 98.450us 6.563us 0.000us 0.00% 0.000us 0.000us 15
4117
+ aten::empty 1.35% 82.243us 1.35% 82.243us 3.427us 0.000us 0.00% 0.000us 0.000us 24
4118
+ cudaLaunchKernel 4.68% 285.943us 4.68% 285.943us 19.063us 0.000us 0.00% 0.000us 0.000us 15
4119
+ aten::empty_strided 0.29% 17.820us 0.29% 17.820us 5.940us 0.000us 0.00% 0.000us 0.000us 3
4120
+ cudaDeviceGetAttribute 0.04% 2.328us 0.04% 2.328us 0.388us 0.000us 0.00% 0.000us 0.000us 6
4121
+ cudaFuncSetAttribute 0.07% 4.078us 0.07% 4.078us 1.359us 0.000us 0.00% 0.000us 0.000us 3
4122
+ cudaDeviceSynchronize 59.20% 3.614ms 59.20% 3.614ms 3.614ms 0.000us 0.00% 0.000us 0.000us 1
4123
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4124
+ Self CPU time total: 6.104ms
4125
+ Self CUDA time total: 4.366ms
4126
 
4127
 
4128
 
 
4132
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4133
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4134
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4135
+ torch_flash_ma 4.45% 272.752us 38.96% 2.390ms 2.390ms 0.000us 0.00% 4.517ms 4.517ms 1
4136
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.467ms 100.24% 4.467ms 4.467ms 1
4137
+ aten::scaled_dot_product_attention 0.45% 27.641us 3.22% 197.213us 65.738us 0.000us 0.00% 3.636ms 1.212ms 3
4138
+ aten::_scaled_dot_product_flash_attention 0.32% 19.841us 2.76% 169.572us 56.524us 0.000us 0.00% 3.636ms 1.212ms 3
4139
+ aten::_flash_attention_forward 0.71% 43.282us 2.06% 126.092us 42.031us 3.636ms 81.58% 3.636ms 1.212ms 3
4140
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.636ms 81.58% 3.636ms 1.212ms 3
4141
+ aten::contiguous 0.18% 11.069us 30.46% 1.869ms 155.711us 0.000us 0.00% 881.085us 73.424us 12
4142
+ aten::clone 0.50% 30.953us 30.28% 1.857ms 154.789us 0.000us 0.00% 881.085us 73.424us 12
4143
+ aten::copy_ 1.39% 85.529us 28.66% 1.758ms 146.482us 820.670us 18.42% 881.085us 73.424us 12
4144
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 820.670us 18.42% 820.670us 68.389us 12
4145
+ Activity Buffer Request 23.40% 1.435ms 23.40% 1.435ms 1.435ms 60.415us 1.36% 60.415us 60.415us 1
4146
+ aten::transpose 0.92% 56.138us 1.22% 75.130us 3.130us 0.000us 0.00% 0.000us 0.000us 24
4147
+ aten::as_strided 0.31% 18.992us 0.31% 18.992us 0.791us 0.000us 0.00% 0.000us 0.000us 24
4148
+ aten::empty_like 0.33% 20.287us 1.48% 90.810us 6.054us 0.000us 0.00% 0.000us 0.000us 15
4149
+ aten::empty 1.36% 83.613us 1.36% 83.613us 3.484us 0.000us 0.00% 0.000us 0.000us 24
4150
+ cudaLaunchKernel 4.26% 261.175us 4.26% 261.175us 17.412us 0.000us 0.00% 0.000us 0.000us 15
4151
+ aten::empty_strided 0.28% 17.260us 0.28% 17.260us 5.753us 0.000us 0.00% 0.000us 0.000us 3
4152
+ cudaDeviceGetAttribute 0.03% 1.850us 0.03% 1.850us 0.308us 0.000us 0.00% 0.000us 0.000us 6
4153
+ cudaFuncSetAttribute 0.07% 4.250us 0.07% 4.250us 1.417us 0.000us 0.00% 0.000us 0.000us 3
4154
+ cudaDeviceSynchronize 61.04% 3.744ms 61.04% 3.744ms 3.744ms 0.000us 0.00% 0.000us 0.000us 1
4155
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4156
+ Self CPU time total: 6.134ms
4157
+ Self CUDA time total: 4.456ms
4158
 
4159
 
4160
  impl wl p50(ms) ok
4161
+ torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
4162
+ torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
4163
+ torch_flash_ma cuda_attn_L320_bfloat16 1.31 True
4164
+ torch_flash_ma cuda_attn_L384_bfloat16 1.34 True
4165
+ torch_flash_ma cuda_attn_L448_bfloat16 1.48 True
4166
+ torch_flash_ma cuda_attn_L512_bfloat16 1.52 True
4167
  </pre></div>
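Note that this run also moved from an L4 to an L40S (see the nvidia-smi diff above), so the lower p50 numbers reflect a hardware change rather than a kernel change. A quick arithmetic sketch of the implied ratios, using the two summary tables in this diff:

# p50 latencies in ms, copied from the old (L4) and new (L40S) summary tables
old_p50 = {"L128": 4.09, "L256": 4.79, "L320": 4.90, "L384": 4.98, "L448": 5.05, "L512": 5.47}
new_p50 = {"L128": 1.22, "L256": 1.27, "L320": 1.31, "L384": 1.34, "L448": 1.48, "L512": 1.52}
for wl, old in old_p50.items():
    print(f"{wl}: {old / new_p50[wl]:.2f}x")  # roughly 3.3x-3.8x across workloads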
4168
  <div class="cell-artifacts">
4169
  <h4>Artifacts:</h4>
4170
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
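Both the old and new reports link the same attention.jsonl artifact. A minimal reader sketch for pulling per-workload p50 values out of it (the field names below are assumptions about the JSONL schema, inferred from the summary columns, and should be checked against the actual file):

import json

# Hypothetical path; each line of the artifact is one benchmark record
with open("artifacts/benchmark/attention.jsonl") as f:
    for line in f:
        rec = json.loads(line)
        # assumed fields: impl, wl (workload dict with name), lat_ms (with p50), ok
        print(rec["impl"], rec["wl"]["name"], rec["lat_ms"]["p50"], rec["ok"])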
flash_attn/impls/hf_kernels_flash_attn.html CHANGED
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
3861
  </div>
3862
  </div>
3863
 
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 47.93s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3926,21 +3926,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
3926
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3927
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3928
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3929
- hf_kernels_flash_attn 1.75% 172.444us 18.87% 1.860ms 1.860ms 0.000us 0.00% 10.982ms 10.982ms 1
3930
- _flash_attn_9e27194::fwd 0.72% 71.472us 17.12% 1.688ms 562.609us 8.236ms 100.00% 10.982ms 3.661ms 3
3931
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 8.238ms 100.02% 8.238ms 8.238ms 1
3932
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 8.236ms 100.00% 8.236ms 2.745ms 3
3933
- Activity Buffer Request 14.98% 1.477ms 14.98% 1.477ms 1.477ms 2.746ms 33.34% 2.746ms 2.746ms 1
3934
- cudaDeviceGetAttribute 0.11% 11.099us 0.11% 11.099us 0.740us 0.000us 0.00% 0.000us 0.000us 15
3935
- aten::empty_like 0.19% 18.800us 0.53% 52.161us 17.387us 0.000us 0.00% 0.000us 0.000us 3
3936
- aten::empty_strided 0.34% 33.361us 0.34% 33.361us 11.120us 0.000us 0.00% 0.000us 0.000us 3
3937
- aten::empty 0.27% 26.650us 0.27% 26.650us 2.961us 0.000us 0.00% 0.000us 0.000us 9
3938
- cudaFuncSetAttribute 0.09% 8.722us 0.09% 8.722us 2.907us 0.000us 0.00% 0.000us 0.000us 3
3939
- cudaLaunchKernel 0.41% 40.651us 0.41% 40.651us 13.550us 0.000us 0.00% 0.000us 0.000us 3
3940
- cudaDeviceSynchronize 81.13% 8.001ms 81.13% 8.001ms 8.001ms 0.000us 0.00% 0.000us 0.000us 1
3941
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3942
- Self CPU time total: 9.861ms
3943
- Self CUDA time total: 8.236ms
3944
 
3945
 
3946
 
@@ -3950,21 +3950,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
3950
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3951
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3952
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3953
- hf_kernels_flash_attn 0.74% 96.063us 13.14% 1.699ms 1.699ms 0.000us 0.00% 15.210ms 15.210ms 1
3954
- _flash_attn_9e27194::fwd 0.37% 48.372us 12.39% 1.603ms 534.225us 11.384ms 100.00% 15.210ms 5.070ms 3
3955
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 11.386ms 100.02% 11.386ms 11.386ms 1
3956
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 11.384ms 100.00% 11.384ms 3.795ms 3
3957
- Activity Buffer Request 11.40% 1.474ms 11.40% 1.474ms 1.474ms 3.826ms 33.61% 3.826ms 3.826ms 1
3958
- cudaDeviceGetAttribute 0.03% 4.448us 0.03% 4.448us 0.297us 0.000us 0.00% 0.000us 0.000us 15
3959
- aten::empty_like 0.05% 6.910us 0.18% 23.882us 7.961us 0.000us 0.00% 0.000us 0.000us 3
3960
- aten::empty_strided 0.13% 16.972us 0.13% 16.972us 5.657us 0.000us 0.00% 0.000us 0.000us 3
3961
- aten::empty 0.17% 21.490us 0.17% 21.490us 2.388us 0.000us 0.00% 0.000us 0.000us 9
3962
- cudaFuncSetAttribute 0.03% 3.650us 0.03% 3.650us 1.217us 0.000us 0.00% 0.000us 0.000us 3
3963
- cudaLaunchKernel 0.21% 26.920us 0.21% 26.920us 8.973us 0.000us 0.00% 0.000us 0.000us 3
3964
- cudaDeviceSynchronize 86.86% 11.232ms 86.86% 11.232ms 11.232ms 0.000us 0.00% 0.000us 0.000us 1
3965
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3966
- Self CPU time total: 12.931ms
3967
- Self CUDA time total: 11.384ms
3968
 
3969
 
3970
 
@@ -3974,21 +3974,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3976
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
- hf_kernels_flash_attn 0.67% 91.024us 12.59% 1.703ms 1.703ms 0.000us 0.00% 15.954ms 15.954ms 1
3978
- _flash_attn_9e27194::fwd 0.35% 47.311us 11.92% 1.612ms 537.434us 11.964ms 100.00% 15.954ms 5.318ms 3
3979
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 11.966ms 100.01% 11.966ms 11.966ms 1
3980
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 11.964ms 100.00% 11.964ms 3.988ms 3
3981
- Activity Buffer Request 10.98% 1.485ms 10.98% 1.485ms 1.485ms 3.990ms 33.35% 3.990ms 3.990ms 1
3982
- cudaDeviceGetAttribute 0.03% 4.340us 0.03% 4.340us 0.289us 0.000us 0.00% 0.000us 0.000us 15
3983
- aten::empty_like 0.06% 8.720us 0.18% 24.830us 8.277us 0.000us 0.00% 0.000us 0.000us 3
3984
- aten::empty_strided 0.12% 16.110us 0.12% 16.110us 5.370us 0.000us 0.00% 0.000us 0.000us 3
3985
- aten::empty 0.15% 20.500us 0.15% 20.500us 2.278us 0.000us 0.00% 0.000us 0.000us 9
3986
- cudaFuncSetAttribute 0.03% 3.660us 0.03% 3.660us 1.220us 0.000us 0.00% 0.000us 0.000us 3
3987
- cudaLaunchKernel 0.20% 26.400us 0.20% 26.400us 8.800us 0.000us 0.00% 0.000us 0.000us 3
3988
- cudaDeviceSynchronize 87.41% 11.823ms 87.41% 11.823ms 11.823ms 0.000us 0.00% 0.000us 0.000us 1
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
- Self CPU time total: 13.526ms
3991
- Self CUDA time total: 11.964ms
3992
 
3993
 
3994
 
@@ -3998,21 +3998,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4000
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4001
- hf_kernels_flash_attn 0.67% 93.544us 14.10% 1.960ms 1.960ms 0.000us 0.00% 16.171ms 16.171ms 1
4002
- _flash_attn_9e27194::fwd 0.34% 47.108us 13.43% 1.866ms 622.149us 12.086ms 100.00% 16.171ms 5.390ms 3
4003
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 12.088ms 100.02% 12.088ms 12.088ms 1
4004
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 12.086ms 100.00% 12.086ms 4.029ms 3
4005
- Activity Buffer Request 10.87% 1.511ms 10.87% 1.511ms 1.511ms 4.085ms 33.80% 4.085ms 4.085ms 1
4006
- cudaDeviceGetAttribute 0.03% 4.151us 0.03% 4.151us 0.277us 0.000us 0.00% 0.000us 0.000us 15
4007
- aten::empty_like 0.05% 7.020us 0.18% 24.401us 8.134us 0.000us 0.00% 0.000us 0.000us 3
4008
- aten::empty_strided 0.13% 17.381us 0.13% 17.381us 5.794us 0.000us 0.00% 0.000us 0.000us 3
4009
- aten::empty 0.16% 21.650us 0.16% 21.650us 2.406us 0.000us 0.00% 0.000us 0.000us 9
4010
- cudaFuncSetAttribute 0.03% 3.680us 0.03% 3.680us 1.227us 0.000us 0.00% 0.000us 0.000us 3
4011
- cudaLaunchKernel 1.83% 254.116us 1.83% 254.116us 84.705us 0.000us 0.00% 0.000us 0.000us 3
4012
- cudaDeviceSynchronize 85.90% 11.939ms 85.90% 11.939ms 11.939ms 0.000us 0.00% 0.000us 0.000us 1
4013
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4014
- Self CPU time total: 13.899ms
4015
- Self CUDA time total: 12.086ms
4016
 
4017
 
4018
 
@@ -4022,21 +4022,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
4022
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4023
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4024
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
- hf_kernels_flash_attn 0.66% 93.812us 13.64% 1.945ms 1.945ms 0.000us 0.00% 16.623ms 16.623ms 1
4026
- _flash_attn_9e27194::fwd 0.35% 50.392us 12.98% 1.852ms 617.193us 12.470ms 100.00% 16.623ms 5.541ms 3
4027
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 12.472ms 100.02% 12.472ms 12.472ms 1
4028
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 12.470ms 100.00% 12.470ms 4.157ms 3
4029
- Activity Buffer Request 10.49% 1.496ms 10.49% 1.496ms 1.496ms 4.153ms 33.30% 4.153ms 4.153ms 1
4030
- cudaDeviceGetAttribute 0.03% 4.180us 0.03% 4.180us 0.279us 0.000us 0.00% 0.000us 0.000us 15
4031
- aten::empty_like 0.11% 15.512us 0.23% 32.181us 10.727us 0.000us 0.00% 0.000us 0.000us 3
4032
- aten::empty_strided 0.12% 16.669us 0.12% 16.669us 5.556us 0.000us 0.00% 0.000us 0.000us 3
4033
- aten::empty 0.15% 21.480us 0.15% 21.480us 2.387us 0.000us 0.00% 0.000us 0.000us 9
4034
- cudaFuncSetAttribute 0.03% 4.150us 0.03% 4.150us 1.383us 0.000us 0.00% 0.000us 0.000us 3
4035
- cudaLaunchKernel 1.70% 242.835us 1.70% 242.835us 80.945us 0.000us 0.00% 0.000us 0.000us 3
4036
- cudaDeviceSynchronize 86.36% 12.315ms 86.36% 12.315ms 12.315ms 0.000us 0.00% 0.000us 0.000us 1
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
- Self CPU time total: 14.261ms
4039
- Self CUDA time total: 12.470ms
4040
 
4041
 
4042
 
@@ -4046,89 +4046,88 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
4046
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4047
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
- hf_kernels_flash_attn 0.61% 96.222us 15.74% 2.480ms 2.480ms 0.000us 0.00% 17.900ms 17.900ms 1
4050
- _flash_attn_9e27194::fwd 0.31% 49.571us 15.13% 2.384ms 794.661us 13.426ms 100.00% 17.900ms 5.967ms 3
4051
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 13.428ms 100.02% 13.428ms 13.428ms 1
4052
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 13.426ms 100.00% 13.426ms 4.475ms 3
4053
- Activity Buffer Request 9.64% 1.519ms 9.64% 1.519ms 1.519ms 4.474ms 33.33% 4.474ms 4.474ms 1
4054
- cudaDeviceGetAttribute 0.03% 4.041us 0.03% 4.041us 0.269us 0.000us 0.00% 0.000us 0.000us 15
4055
- aten::empty_like 0.05% 7.901us 0.16% 24.582us 8.194us 0.000us 0.00% 0.000us 0.000us 3
4056
- aten::empty_strided 0.11% 16.681us 0.11% 16.681us 5.560us 0.000us 0.00% 0.000us 0.000us 3
4057
- aten::empty 0.13% 20.818us 0.13% 20.818us 2.313us 0.000us 0.00% 0.000us 0.000us 9
4058
- cudaFuncSetAttribute 0.02% 3.610us 0.02% 3.610us 1.203us 0.000us 0.00% 0.000us 0.000us 3
4059
- cudaLaunchKernel 4.84% 761.957us 4.84% 761.957us 253.986us 0.000us 0.00% 0.000us 0.000us 3
4060
- cudaDeviceSynchronize 84.26% 13.278ms 84.26% 13.278ms 13.278ms 0.000us 0.00% 0.000us 0.000us 1
4061
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4062
- Self CPU time total: 15.758ms
4063
- Self CUDA time total: 13.426ms
4064
 
4065
 
4066
  impl wl p50(ms) ok
4067
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 2.82 True
4068
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 3.91 True
4069
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 4.12 True
4070
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 4.13 True
4071
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 4.11 True
4072
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 4.57 True
4073
  </pre></div>
4074
  <div class="uv-install-logs" id="uv-logs-benchmark">
4075
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4076
  <div class="uv-logs-content" style="display: none;">
4077
- Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
4078
- Downloading nvidia-cufft-cu12 (184.2MiB)
4079
  Downloading hf-xet (3.2MiB)
4080
- Downloading setuptools (1.1MiB)
4081
- Downloading sympy (6.0MiB)
4082
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4083
- Downloading fonttools (4.7MiB)
4084
- Downloading kiwisolver (1.4MiB)
4085
  Downloading networkx (1.9MiB)
4086
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4087
- Downloading pillow (6.7MiB)
4088
- Downloading nvidia-cublas-cu12 (566.8MiB)
4089
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4090
  Downloading nvidia-nccl-cu12 (307.4MiB)
4091
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4092
- Downloading triton (148.4MiB)
4093
  Downloading nvidia-curand-cu12 (60.7MiB)
4094
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4095
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4096
  Downloading nvidia-cudnn-cu12 (674.0MiB)
4097
- Downloading torch (846.8MiB)
4098
- Downloading numpy (15.9MiB)
 
4099
  Downloading nvidia-cufile-cu12 (1.1MiB)
4100
- Downloading matplotlib (8.3MiB)
4101
  Downloading nvidia-cufile-cu12
4102
  Downloading kiwisolver
4103
  Downloading hf-xet
4104
  Downloading setuptools
4105
- Downloading fonttools
4106
  Downloading networkx
 
4107
  Downloading pillow
4108
- Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
4109
  Downloading nvidia-cuda-cupti-cu12
4110
  Downloading matplotlib
4111
  Downloading numpy
4112
- Downloading nvidia-nvjitlink-cu12
4113
  Downloading sympy
 
4114
  Downloading nvidia-curand-cu12
4115
  Downloading nvidia-cuda-nvrtc-cu12
4116
  Downloading triton
4117
  Downloading nvidia-cufft-cu12
4118
  Downloading nvidia-cusolver-cu12
4119
- Downloading nvidia-cusparse-cu12
4120
  Downloading nvidia-cusparselt-cu12
 
4121
  Downloading nvidia-nccl-cu12
4122
  Downloading nvidia-cublas-cu12
4123
  Downloading nvidia-cudnn-cu12
4124
  Downloading torch
4125
- Installed 47 packages in 223ms
4126
  </div>
4127
  </div>
4128
  <div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
4129
- Fetching 20 files: 5%|▌ | 1/20 [00:00&lt;00:04, 4.15it/s]
4130
- Fetching 20 files: 10%|█ | 2/20 [00:03&lt;00:35, 1.96s/it]
4131
- Fetching 20 files: 100%|██████████| 20/20 [00:03&lt;00:00, 5.86it/s]</div>
4132
  <div class="cell-artifacts">
4133
  <h4>Artifacts:</h4>
4134
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
 
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 35.44s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3926
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3927
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3928
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3929
+ hf_kernels_flash_attn 3.89% 173.532us 41.54% 1.852ms 1.852ms 0.000us 0.00% 3.821ms 3.821ms 1
3930
+ _flash_attn_9e27194::fwd 1.71% 76.382us 37.65% 1.679ms 559.513us 2.851ms 100.00% 3.821ms 1.274ms 3
3931
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.852ms 100.05% 2.852ms 2.852ms 1
3932
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.851ms 100.00% 2.851ms 950.289us 3
3933
+ Activity Buffer Request 32.53% 1.450ms 32.53% 1.450ms 1.450ms 970.364us 34.04% 970.364us 970.364us 1
3934
+ cudaDeviceGetAttribute 0.10% 4.520us 0.10% 4.520us 0.301us 0.000us 0.00% 0.000us 0.000us 15
3935
+ aten::empty_like 0.46% 20.440us 1.29% 57.461us 19.154us 0.000us 0.00% 0.000us 0.000us 3
3936
+ aten::empty_strided 0.83% 37.021us 0.83% 37.021us 12.340us 0.000us 0.00% 0.000us 0.000us 3
3937
+ aten::empty 0.76% 33.730us 0.76% 33.730us 3.748us 0.000us 0.00% 0.000us 0.000us 9
3938
+ cudaFuncSetAttribute 0.29% 12.870us 0.29% 12.870us 4.290us 0.000us 0.00% 0.000us 0.000us 3
3939
+ cudaLaunchKernel 0.97% 43.280us 0.97% 43.280us 14.427us 0.000us 0.00% 0.000us 0.000us 3
3940
+ cudaDeviceSynchronize 58.46% 2.606ms 58.46% 2.606ms 2.606ms 0.000us 0.00% 0.000us 0.000us 1
3941
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3942
+ Self CPU time total: 4.458ms
3943
+ Self CUDA time total: 2.851ms
3944
 
3945
 
3946
 
 
3950
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3951
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3952
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3953
+ hf_kernels_flash_attn 2.32% 104.162us 37.24% 1.676ms 1.676ms 0.000us 0.00% 4.000ms 4.000ms 1
3954
+ _flash_attn_9e27194::fwd 1.05% 47.052us 34.93% 1.571ms 523.812us 2.988ms 100.00% 4.000ms 1.333ms 3
3955
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.989ms 100.04% 2.989ms 2.989ms 1
3956
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.988ms 100.00% 2.988ms 995.942us 3
3957
+ Activity Buffer Request 32.02% 1.441ms 32.02% 1.441ms 1.441ms 1.012ms 33.87% 1.012ms 1.012ms 1
3958
+ cudaDeviceGetAttribute 0.10% 4.331us 0.10% 4.331us 0.289us 0.000us 0.00% 0.000us 0.000us 15
3959
+ aten::empty_like 0.16% 7.210us 0.52% 23.350us 7.783us 0.000us 0.00% 0.000us 0.000us 3
3960
+ aten::empty_strided 0.36% 16.140us 0.36% 16.140us 5.380us 0.000us 0.00% 0.000us 0.000us 3
3961
+ aten::empty 0.47% 21.320us 0.47% 21.320us 2.369us 0.000us 0.00% 0.000us 0.000us 9
3962
+ cudaFuncSetAttribute 0.10% 4.349us 0.10% 4.349us 1.450us 0.000us 0.00% 0.000us 0.000us 3
3963
+ cudaLaunchKernel 0.67% 30.329us 0.67% 30.329us 10.110us 0.000us 0.00% 0.000us 0.000us 3
3964
+ cudaDeviceSynchronize 62.76% 2.824ms 62.76% 2.824ms 2.824ms 0.000us 0.00% 0.000us 0.000us 1
3965
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3966
+ Self CPU time total: 4.499ms
3967
+ Self CUDA time total: 2.988ms
3968
 
3969
 
3970
 
 
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3976
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
+ hf_kernels_flash_attn 2.58% 116.241us 37.17% 1.677ms 1.677ms 0.000us 0.00% 4.040ms 4.040ms 1
3978
+ _flash_attn_9e27194::fwd 1.11% 49.909us 34.60% 1.561ms 520.326us 3.012ms 100.00% 4.040ms 1.347ms 3
3979
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.013ms 100.04% 3.013ms 3.013ms 1
3980
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.012ms 100.00% 3.012ms 1.004ms 3
3981
+ Activity Buffer Request 31.60% 1.426ms 31.60% 1.426ms 1.426ms 1.029ms 34.16% 1.029ms 1.029ms 1
3982
+ cudaDeviceGetAttribute 0.08% 3.801us 0.08% 3.801us 0.253us 0.000us 0.00% 0.000us 0.000us 15
3983
+ aten::empty_like 0.18% 8.151us 0.55% 24.960us 8.320us 0.000us 0.00% 0.000us 0.000us 3
3984
+ aten::empty_strided 0.37% 16.809us 0.37% 16.809us 5.603us 0.000us 0.00% 0.000us 0.000us 3
3985
+ aten::empty 0.47% 21.201us 0.47% 21.201us 2.356us 0.000us 0.00% 0.000us 0.000us 9
3986
+ cudaFuncSetAttribute 0.09% 3.950us 0.09% 3.950us 1.317us 0.000us 0.00% 0.000us 0.000us 3
3987
+ cudaLaunchKernel 0.69% 31.260us 0.69% 31.260us 10.420us 0.000us 0.00% 0.000us 0.000us 3
3988
+ cudaDeviceSynchronize 62.83% 2.835ms 62.83% 2.835ms 2.835ms 0.000us 0.00% 0.000us 0.000us 1
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
+ Self CPU time total: 4.512ms
3991
+ Self CUDA time total: 3.012ms
3992
 
3993
 
3994
 
 
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4000
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4001
+ hf_kernels_flash_attn 2.01% 99.212us 38.53% 1.898ms 1.898ms 0.000us 0.00% 4.264ms 4.264ms 1
4002
+ _flash_attn_9e27194::fwd 1.06% 52.152us 36.51% 1.799ms 599.723us 3.190ms 100.00% 4.264ms 1.421ms 3
4003
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.191ms 100.05% 3.191ms 3.191ms 1
4004
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.190ms 100.00% 3.190ms 1.063ms 3
4005
+ Activity Buffer Request 28.82% 1.420ms 28.82% 1.420ms 1.420ms 1.074ms 33.68% 1.074ms 1.074ms 1
4006
+ cudaDeviceGetAttribute 0.09% 4.479us 0.09% 4.479us 0.299us 0.000us 0.00% 0.000us 0.000us 15
4007
+ aten::empty_like 0.16% 7.900us 0.54% 26.470us 8.823us 0.000us 0.00% 0.000us 0.000us 3
4008
+ aten::empty_strided 0.38% 18.570us 0.38% 18.570us 6.190us 0.000us 0.00% 0.000us 0.000us 3
4009
+ aten::empty 0.46% 22.430us 0.46% 22.430us 2.492us 0.000us 0.00% 0.000us 0.000us 9
4010
+ cudaFuncSetAttribute 0.08% 3.830us 0.08% 3.830us 1.277us 0.000us 0.00% 0.000us 0.000us 3
4011
+ cudaLaunchKernel 5.47% 269.763us 5.47% 269.763us 89.921us 0.000us 0.00% 0.000us 0.000us 3
4012
+ cudaDeviceSynchronize 61.47% 3.029ms 61.47% 3.029ms 3.029ms 0.000us 0.00% 0.000us 0.000us 1
4013
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4014
+ Self CPU time total: 4.928ms
4015
+ Self CUDA time total: 3.190ms
4016
 
4017
 
4018
 
 
4022
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4023
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4024
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
+ hf_kernels_flash_attn 2.16% 88.971us 14.91% 614.057us 614.057us 0.000us 0.00% 4.875ms 4.875ms 1
4026
+ _flash_attn_9e27194::fwd 1.23% 50.539us 12.75% 525.086us 175.029us 3.652ms 100.00% 4.875ms 1.625ms 3
4027
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.653ms 100.04% 3.653ms 3.653ms 1
4028
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.652ms 100.00% 3.652ms 1.217ms 3
4029
+ Activity Buffer Request 5.08% 209.112us 5.08% 209.112us 209.112us 1.223ms 33.50% 1.223ms 1.223ms 1
4030
+ cudaDeviceGetAttribute 0.10% 3.960us 0.10% 3.960us 0.264us 0.000us 0.00% 0.000us 0.000us 15
4031
+ aten::empty_like 0.19% 7.749us 0.60% 24.700us 8.233us 0.000us 0.00% 0.000us 0.000us 3
4032
+ aten::empty_strided 0.41% 16.951us 0.41% 16.951us 5.650us 0.000us 0.00% 0.000us 0.000us 3
4033
+ aten::empty 0.54% 22.121us 0.54% 22.121us 2.458us 0.000us 0.00% 0.000us 0.000us 9
4034
+ cudaFuncSetAttribute 0.10% 4.190us 0.10% 4.190us 1.397us 0.000us 0.00% 0.000us 0.000us 3
4035
+ cudaLaunchKernel 5.11% 210.464us 5.11% 210.464us 70.155us 0.000us 0.00% 0.000us 0.000us 3
4036
+ cudaDeviceSynchronize 85.09% 3.504ms 85.09% 3.504ms 3.504ms 0.000us 0.00% 0.000us 0.000us 1
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
+ Self CPU time total: 4.118ms
4039
+ Self CUDA time total: 3.652ms
4040
 
4041
 
4042
 
 
4046
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4047
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
+ hf_kernels_flash_attn 2.23% 91.402us 14.65% 600.857us 600.857us 0.000us 0.00% 4.881ms 4.881ms 1
4050
+ _flash_attn_9e27194::fwd 1.15% 47.191us 12.42% 509.455us 169.818us 3.654ms 100.00% 4.881ms 1.627ms 3
4051
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 100.04% 3.655ms 3.655ms 1
4052
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.654ms 100.00% 3.654ms 1.218ms 3
4053
+ Activity Buffer Request 5.38% 220.623us 5.38% 220.623us 220.623us 1.227ms 33.59% 1.227ms 1.227ms 1
4054
+ cudaDeviceGetAttribute 0.09% 3.601us 0.09% 3.601us 0.240us 0.000us 0.00% 0.000us 0.000us 15
4055
+ aten::empty_like 0.18% 7.230us 0.58% 23.840us 7.947us 0.000us 0.00% 0.000us 0.000us 3
4056
+ aten::empty_strided 0.40% 16.610us 0.40% 16.610us 5.537us 0.000us 0.00% 0.000us 0.000us 3
4057
+ aten::empty 0.51% 20.851us 0.51% 20.851us 2.317us 0.000us 0.00% 0.000us 0.000us 9
4058
+ cudaFuncSetAttribute 0.09% 3.688us 0.09% 3.688us 1.229us 0.000us 0.00% 0.000us 0.000us 3
4059
+ cudaLaunchKernel 4.62% 189.661us 4.62% 189.661us 63.220us 0.000us 0.00% 0.000us 0.000us 3
4060
+ cudaDeviceSynchronize 85.35% 3.502ms 85.35% 3.502ms 3.502ms 0.000us 0.00% 0.000us 0.000us 1
4061
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4062
+ Self CPU time total: 4.103ms
4063
+ Self CUDA time total: 3.654ms
4064
 
4065
 
4066
  impl wl p50(ms) ok
4067
+ hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.98 True
4068
+ hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.02 True
4069
+ hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
4070
+ hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.07 True
4071
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.23 True
4072
+ hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
4073
  </pre></div>
4074
  <div class="uv-install-logs" id="uv-logs-benchmark">
4075
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4076
  <div class="uv-logs-content" style="display: none;">
4077
+ Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
 
4078
  Downloading hf-xet (3.2MiB)
4079
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4080
  Downloading networkx (1.9MiB)
4081
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4082
  Downloading nvidia-nccl-cu12 (307.4MiB)
4083
+ Downloading kiwisolver (1.4MiB)
4084
+ Downloading pillow (6.7MiB)
4085
  Downloading nvidia-curand-cu12 (60.7MiB)
4086
+ Downloading nvidia-cublas-cu12 (566.8MiB)
4087
+ Downloading sympy (6.0MiB)
4088
+ Downloading setuptools (1.1MiB)
4089
+ Downloading matplotlib (8.3MiB)
4090
+ Downloading numpy (16.2MiB)
4091
+ Downloading triton (148.3MiB)
4092
  Downloading nvidia-cudnn-cu12 (674.0MiB)
4093
+ Downloading fonttools (4.7MiB)
4094
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4095
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4096
  Downloading nvidia-cufile-cu12 (1.1MiB)
4097
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4098
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4099
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4100
+ Downloading torch (846.9MiB)
4101
  Downloading nvidia-cufile-cu12
4102
  Downloading kiwisolver
4103
  Downloading hf-xet
4104
  Downloading setuptools
 
4105
  Downloading networkx
4106
+ Downloading fonttools
4107
  Downloading pillow
4108
+ Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4109
  Downloading nvidia-cuda-cupti-cu12
4110
  Downloading matplotlib
4111
  Downloading numpy
 
4112
  Downloading sympy
4113
+ Downloading nvidia-nvjitlink-cu12
4114
  Downloading nvidia-curand-cu12
4115
  Downloading nvidia-cuda-nvrtc-cu12
4116
  Downloading triton
4117
  Downloading nvidia-cufft-cu12
4118
  Downloading nvidia-cusolver-cu12
 
4119
  Downloading nvidia-cusparselt-cu12
4120
+ Downloading nvidia-cusparse-cu12
4121
  Downloading nvidia-nccl-cu12
4122
  Downloading nvidia-cublas-cu12
4123
  Downloading nvidia-cudnn-cu12
4124
  Downloading torch
4125
+ Installed 52 packages in 223ms
4126
  </div>
4127
  </div>
4128
  <div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
4129
+ Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:12, 1.43it/s]
4130
+ Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 14.34it/s]</div>
 
4131
  <div class="cell-artifacts">
4132
  <h4>Artifacts:</h4>
4133
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
3861
  </div>
3862
  </div>
3863
 
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 45.91s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3925,19 +3925,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
3925
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3926
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3927
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3928
- hf_kernels_flash_attn3 1.92% 183.884us 20.78% 1.986ms 1.986ms 0.000us 0.00% 10.512ms 10.512ms 1
3929
- FlashAttnFunc 1.41% 134.465us 18.86% 1.802ms 600.660us 0.000us 0.00% 10.512ms 3.504ms 3
3930
- _flash_attn3_48fe103_dirty::fwd 0.80% 76.599us 17.45% 1.668ms 555.838us 7.883ms 100.00% 10.512ms 3.504ms 3
3931
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 7.884ms 100.02% 7.884ms 7.884ms 1
3932
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 7.883ms 100.00% 7.883ms 2.628ms 3
3933
- Activity Buffer Request 15.56% 1.487ms 15.56% 1.487ms 1.487ms 2.629ms 33.36% 2.629ms 2.629ms 1
3934
- aten::empty 0.46% 44.151us 0.46% 44.151us 7.358us 0.000us 0.00% 0.000us 0.000us 6
3935
- cudaFuncSetAttribute 0.16% 15.420us 0.16% 15.420us 5.140us 0.000us 0.00% 0.000us 0.000us 3
3936
- cudaLaunchKernel 0.46% 44.162us 0.46% 44.162us 14.721us 0.000us 0.00% 0.000us 0.000us 3
3937
- cudaDeviceSynchronize 79.22% 7.570ms 79.22% 7.570ms 7.570ms 0.000us 0.00% 0.000us 0.000us 1
3938
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3939
- Self CPU time total: 9.555ms
3940
- Self CUDA time total: 7.883ms
3941
 
3942
 
3943
 
@@ -3947,19 +3947,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
3947
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3948
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3949
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3950
- FlashAttnFunc 0.80% 101.601us 13.56% 1.712ms 570.799us 0.000us 0.00% 14.746ms 4.915ms 3
3951
- _flash_attn3_48fe103_dirty::fwd 0.39% 49.531us 12.75% 1.611ms 536.932us 11.037ms 100.00% 14.746ms 4.915ms 3
3952
- hf_kernels_flash_attn3 0.89% 111.943us 14.45% 1.824ms 1.824ms 0.000us 0.00% 14.746ms 14.746ms 1
3953
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 11.039ms 100.02% 11.039ms 11.039ms 1
3954
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 11.037ms 100.00% 11.037ms 3.679ms 3
3955
- Activity Buffer Request 11.87% 1.500ms 11.87% 1.500ms 1.500ms 3.709ms 33.60% 3.709ms 3.709ms 1
3956
- aten::empty 0.21% 26.220us 0.21% 26.220us 4.370us 0.000us 0.00% 0.000us 0.000us 6
3957
- cudaFuncSetAttribute 0.04% 5.092us 0.04% 5.092us 1.697us 0.000us 0.00% 0.000us 0.000us 3
3958
- cudaLaunchKernel 0.24% 30.290us 0.24% 30.290us 10.097us 0.000us 0.00% 0.000us 0.000us 3
3959
- cudaDeviceSynchronize 85.55% 10.805ms 85.55% 10.805ms 10.805ms 0.000us 0.00% 0.000us 0.000us 1
3960
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3961
- Self CPU time total: 12.629ms
3962
- Self CUDA time total: 11.037ms
3963
 
3964
 
3965
 
@@ -3969,19 +3969,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
3969
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3970
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3971
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3972
- hf_kernels_flash_attn3 0.84% 108.082us 14.36% 1.851ms 1.851ms 0.000us 0.00% 15.081ms 15.081ms 1
3973
- FlashAttnFunc 0.79% 101.882us 13.52% 1.743ms 580.849us 0.000us 0.00% 15.081ms 5.027ms 3
3974
- _flash_attn3_48fe103_dirty::fwd 0.38% 48.472us 12.73% 1.641ms 546.889us 11.268ms 100.00% 15.081ms 5.027ms 3
3975
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 11.269ms 100.02% 11.269ms 11.269ms 1
3976
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 11.268ms 100.00% 11.268ms 3.756ms 3
3977
- Activity Buffer Request 11.87% 1.530ms 11.87% 1.530ms 1.530ms 3.813ms 33.84% 3.813ms 3.813ms 1
3978
- aten::empty 0.21% 26.670us 0.21% 26.670us 4.445us 0.000us 0.00% 0.000us 0.000us 6
3979
- cudaFuncSetAttribute 0.04% 5.170us 0.04% 5.170us 1.723us 0.000us 0.00% 0.000us 0.000us 3
3980
- cudaLaunchKernel 0.24% 30.581us 0.24% 30.581us 10.194us 0.000us 0.00% 0.000us 0.000us 3
3981
- cudaDeviceSynchronize 85.64% 11.041ms 85.64% 11.041ms 11.041ms 0.000us 0.00% 0.000us 0.000us 1
3982
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3983
- Self CPU time total: 12.891ms
3984
- Self CUDA time total: 11.268ms
3985
 
3986
 
3987
 
@@ -3991,19 +3991,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
- hf_kernels_flash_attn3 0.87% 107.542us 12.07% 1.493ms 1.493ms 0.000us 0.00% 14.923ms 14.923ms 1
3995
- FlashAttnFunc 0.84% 104.222us 11.20% 1.385ms 461.687us 0.000us 0.00% 14.923ms 4.974ms 3
3996
- _flash_attn3_48fe103_dirty::fwd 0.41% 51.032us 10.36% 1.281ms 426.946us 11.101ms 100.00% 14.923ms 4.974ms 3
3997
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 11.102ms 100.02% 11.102ms 11.102ms 1
3998
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 11.101ms 100.00% 11.101ms 3.700ms 3
3999
- Activity Buffer Request 7.69% 950.601us 7.69% 950.601us 950.601us 3.822ms 34.43% 3.822ms 3.822ms 1
4000
- aten::empty 0.22% 27.719us 0.22% 27.719us 4.620us 0.000us 0.00% 0.000us 0.000us 6
4001
- cudaFuncSetAttribute 0.04% 5.160us 0.04% 5.160us 1.720us 0.000us 0.00% 0.000us 0.000us 3
4002
- cudaLaunchKernel 1.99% 246.326us 1.99% 246.326us 82.109us 0.000us 0.00% 0.000us 0.000us 3
4003
- cudaDeviceSynchronize 87.93% 10.869ms 87.93% 10.869ms 10.869ms 0.000us 0.00% 0.000us 0.000us 1
4004
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
- Self CPU time total: 12.361ms
4006
- Self CUDA time total: 11.101ms
4007
 
4008
 
4009
 
@@ -4013,19 +4013,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
4013
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4014
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
- hf_kernels_flash_attn3 0.89% 122.681us 14.72% 2.032ms 2.032ms 0.000us 0.00% 16.019ms 16.019ms 1
4017
- FlashAttnFunc 0.72% 100.054us 13.83% 1.909ms 636.464us 0.000us 0.00% 16.019ms 5.340ms 3
4018
- _flash_attn3_48fe103_dirty::fwd 0.37% 50.743us 13.11% 1.809ms 603.113us 11.999ms 100.00% 16.019ms 5.340ms 3
4019
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 12.001ms 100.02% 12.001ms 12.001ms 1
4020
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 11.999ms 100.00% 11.999ms 4.000ms 3
4021
- Activity Buffer Request 10.68% 1.474ms 10.68% 1.474ms 1.474ms 4.020ms 33.50% 4.020ms 4.020ms 1
4022
- aten::empty 0.20% 27.509us 0.20% 27.509us 4.585us 0.000us 0.00% 0.000us 0.000us 6
4023
- cudaFuncSetAttribute 0.04% 5.180us 0.04% 5.180us 1.727us 0.000us 0.00% 0.000us 0.000us 3
4024
- cudaLaunchKernel 1.82% 251.475us 1.82% 251.475us 83.825us 0.000us 0.00% 0.000us 0.000us 3
4025
- cudaDeviceSynchronize 85.28% 11.773ms 85.28% 11.773ms 11.773ms 0.000us 0.00% 0.000us 0.000us 1
4026
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
- Self CPU time total: 13.805ms
4028
- Self CUDA time total: 11.999ms
4029
 
4030
 
4031
 
@@ -4035,87 +4035,34 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
- hf_kernels_flash_attn3 0.65% 102.032us 20.79% 3.268ms 3.268ms 0.000us 0.00% 16.971ms 16.971ms 1
4039
- FlashAttnFunc 0.66% 104.392us 20.14% 3.166ms 1.055ms 0.000us 0.00% 16.971ms 5.657ms 3
4040
- _flash_attn3_48fe103_dirty::fwd 0.30% 47.113us 19.48% 3.062ms 1.021ms 12.681ms 100.00% 16.971ms 5.657ms 3
4041
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 12.683ms 100.02% 12.683ms 12.683ms 1
4042
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 12.681ms 100.00% 12.681ms 4.227ms 3
4043
- Activity Buffer Request 10.87% 1.709ms 10.87% 1.709ms 1.709ms 4.290ms 33.83% 4.290ms 4.290ms 1
4044
- aten::empty 0.17% 27.090us 0.17% 27.090us 4.515us 0.000us 0.00% 0.000us 0.000us 6
4045
- cudaFuncSetAttribute 0.03% 5.219us 0.03% 5.219us 1.740us 0.000us 0.00% 0.000us 0.000us 3
4046
- cudaLaunchKernel 8.10% 1.273ms 8.10% 1.273ms 424.362us 0.000us 0.00% 0.000us 0.000us 3
4047
- cudaDeviceSynchronize 79.21% 12.453ms 79.21% 12.453ms 12.453ms 0.000us 0.00% 0.000us 0.000us 1
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
- Self CPU time total: 15.722ms
4050
- Self CUDA time total: 12.681ms
4051
 
4052
 
4053
  impl wl p50(ms) ok
4054
- hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 3.22 True
4055
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 3.77 True
4056
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 3.91 True
4057
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 3.97 True
4058
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 4.19 True
4059
- hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 4.41 True
4060
  </pre></div>
4061
- <div class="uv-install-logs" id="uv-logs-benchmark">
4062
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4063
- <div class="uv-logs-content" style="display: none;">
4064
- Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
4065
- Downloading nvidia-cufile-cu12 (1.1MiB)
4066
- Downloading nvidia-cufft-cu12 (184.2MiB)
4067
- Downloading sympy (6.0MiB)
4068
- Downloading setuptools (1.1MiB)
4069
- Downloading nvidia-curand-cu12 (60.7MiB)
4070
- Downloading numpy (15.9MiB)
4071
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4072
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4073
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4074
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4075
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4076
- Downloading matplotlib (8.3MiB)
4077
- Downloading hf-xet (3.2MiB)
4078
- Downloading nvidia-cublas-cu12 (566.8MiB)
4079
- Downloading pillow (6.7MiB)
4080
- Downloading nvidia-nccl-cu12 (307.4MiB)
4081
- Downloading networkx (1.9MiB)
4082
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4083
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4084
- Downloading triton (148.4MiB)
4085
- Downloading fonttools (4.7MiB)
4086
- Downloading kiwisolver (1.4MiB)
4087
- Downloading torch (846.8MiB)
4088
- Downloading nvidia-cufile-cu12
4089
- Downloading kiwisolver
4090
- Downloading hf-xet
4091
- Downloading setuptools
4092
- Downloading networkx
4093
- Downloading fonttools
4094
- Downloading pillow
4095
- Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
4096
- Downloading nvidia-cuda-cupti-cu12
4097
- Downloading matplotlib
4098
- Downloading numpy
4099
- Downloading sympy
4100
- Downloading nvidia-nvjitlink-cu12
4101
- Downloading nvidia-curand-cu12
4102
- Downloading nvidia-cuda-nvrtc-cu12
4103
- Downloading triton
4104
- Downloading nvidia-cufft-cu12
4105
- Downloading nvidia-cusolver-cu12
4106
- Downloading nvidia-cusparse-cu12
4107
- Downloading nvidia-cusparselt-cu12
4108
- Downloading nvidia-nccl-cu12
4109
- Downloading nvidia-cublas-cu12
4110
- Downloading nvidia-cudnn-cu12
4111
- Downloading torch
4112
- Installed 47 packages in 222ms
4113
  </div>
4114
- </div>
4115
- <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4116
- Fetching 4 files: 25%|██▌ | 1/4 [00:00&lt;00:00, 7.95it/s]
4117
- Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.15it/s]
4118
- Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.64it/s]</div>
4119
  <div class="cell-artifacts">
4120
  <h4>Artifacts:</h4>
4121
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
 
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 5.62s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3925
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3926
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3927
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3928
+ hf_kernels_flash_attn3 3.90% 171.143us 44.22% 1.941ms 1.941ms 0.000us 0.00% 3.653ms 3.653ms 1
3929
+ FlashAttnFunc 2.92% 128.011us 40.32% 1.769ms 589.788us 0.000us 0.00% 3.653ms 1.218ms 3
3930
+ _flash_attn3_48fe103_dirty::fwd 1.90% 83.422us 37.41% 1.641ms 547.118us 2.755ms 100.00% 3.653ms 1.218ms 3
3931
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.756ms 100.05% 2.756ms 2.756ms 1
3932
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.755ms 100.00% 2.755ms 918.306us 3
3933
+ Activity Buffer Request 33.13% 1.454ms 33.13% 1.454ms 1.454ms 898.082us 32.60% 898.082us 898.082us 1
3934
+ aten::empty 1.02% 44.762us 1.02% 44.762us 7.460us 0.000us 0.00% 0.000us 0.000us 6
3935
+ cudaFuncSetAttribute 0.33% 14.660us 0.33% 14.660us 4.887us 0.000us 0.00% 0.000us 0.000us 3
3936
+ cudaLaunchKernel 1.02% 44.660us 1.02% 44.660us 14.887us 0.000us 0.00% 0.000us 0.000us 3
3937
+ cudaDeviceSynchronize 55.78% 2.447ms 55.78% 2.447ms 2.447ms 0.000us 0.00% 0.000us 0.000us 1
3938
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3939
+ Self CPU time total: 4.388ms
3940
+ Self CUDA time total: 2.755ms
3941
 
3942
 
3943
 
 
3947
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3948
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3949
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3950
+ hf_kernels_flash_attn3 2.42% 105.470us 40.03% 1.743ms 1.743ms 0.000us 0.00% 3.784ms 3.784ms 1
3951
+ FlashAttnFunc 2.12% 92.121us 37.61% 1.638ms 546.005us 0.000us 0.00% 3.784ms 1.261ms 3
3952
+ _flash_attn3_48fe103_dirty::fwd 1.23% 53.460us 35.49% 1.546ms 515.298us 2.836ms 100.00% 3.784ms 1.261ms 3
3953
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.838ms 100.05% 2.838ms 2.838ms 1
3954
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.836ms 100.00% 2.836ms 945.359us 3
3955
+ Activity Buffer Request 32.85% 1.431ms 32.85% 1.431ms 1.431ms 947.652us 33.41% 947.652us 947.652us 1
3956
+ aten::empty 0.62% 27.052us 0.62% 27.052us 4.509us 0.000us 0.00% 0.000us 0.000us 6
3957
+ cudaFuncSetAttribute 0.11% 4.721us 0.11% 4.721us 1.574us 0.000us 0.00% 0.000us 0.000us 3
3958
+ cudaLaunchKernel 0.68% 29.730us 0.68% 29.730us 9.910us 0.000us 0.00% 0.000us 0.000us 3
3959
+ cudaDeviceSynchronize 59.97% 2.612ms 59.97% 2.612ms 2.612ms 0.000us 0.00% 0.000us 0.000us 1
3960
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3961
+ Self CPU time total: 4.355ms
3962
+ Self CUDA time total: 2.836ms
3963
 
3964
 
3965
 
 
3969
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3970
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3971
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3972
+ hf_kernels_flash_attn3 2.34% 104.112us 39.68% 1.767ms 1.767ms 0.000us 0.00% 3.931ms 3.931ms 1
3973
+ FlashAttnFunc 2.59% 115.143us 37.35% 1.662ms 554.155us 0.000us 0.00% 3.931ms 1.310ms 3
3974
+ _flash_attn3_48fe103_dirty::fwd 1.23% 54.772us 34.76% 1.547ms 515.774us 2.932ms 100.00% 3.931ms 1.310ms 3
3975
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.934ms 100.05% 2.934ms 2.934ms 1
3976
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.932ms 100.00% 2.932ms 977.432us 3
3977
+ Activity Buffer Request 32.05% 1.427ms 32.05% 1.427ms 1.427ms 998.487us 34.05% 998.487us 998.487us 1
3978
+ aten::empty 0.66% 29.309us 0.66% 29.309us 4.885us 0.000us 0.00% 0.000us 0.000us 6
3979
+ cudaFuncSetAttribute 0.11% 4.840us 0.11% 4.840us 1.613us 0.000us 0.00% 0.000us 0.000us 3
3980
+ cudaLaunchKernel 0.71% 31.520us 0.71% 31.520us 10.507us 0.000us 0.00% 0.000us 0.000us 3
3981
+ cudaDeviceSynchronize 60.32% 2.685ms 60.32% 2.685ms 2.685ms 0.000us 0.00% 0.000us 0.000us 1
3982
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3983
+ Self CPU time total: 4.452ms
3984
+ Self CUDA time total: 2.932ms
3985
 
3986
 
3987
 
 
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
+ hf_kernels_flash_attn3 2.48% 118.391us 41.58% 1.983ms 1.983ms 0.000us 0.00% 4.029ms 4.029ms 1
3995
+ FlashAttnFunc 2.00% 95.232us 39.09% 1.865ms 621.579us 0.000us 0.00% 4.029ms 1.343ms 3
3996
+ _flash_attn3_48fe103_dirty::fwd 1.18% 56.301us 37.10% 1.770ms 589.835us 3.014ms 100.00% 4.029ms 1.343ms 3
3997
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.016ms 100.06% 3.016ms 3.016ms 1
3998
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.014ms 100.00% 3.014ms 1.005ms 3
3999
+ Activity Buffer Request 30.19% 1.440ms 30.19% 1.440ms 1.440ms 1.015ms 33.67% 1.015ms 1.015ms 1
4000
+ aten::empty 0.58% 27.710us 0.58% 27.710us 4.618us 0.000us 0.00% 0.000us 0.000us 6
4001
+ cudaFuncSetAttribute 0.10% 4.771us 0.10% 4.771us 1.590us 0.000us 0.00% 0.000us 0.000us 3
4002
+ cudaLaunchKernel 5.05% 240.873us 5.05% 240.873us 80.291us 0.000us 0.00% 0.000us 0.000us 3
4003
+ cudaDeviceSynchronize 58.42% 2.787ms 58.42% 2.787ms 2.787ms 0.000us 0.00% 0.000us 0.000us 1
4004
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
+ Self CPU time total: 4.770ms
4006
+ Self CUDA time total: 3.014ms
4007
 
4008
 
4009
 
 
4013
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4014
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
+ hf_kernels_flash_attn3 2.45% 127.821us 37.14% 1.937ms 1.937ms 0.000us 0.00% 4.669ms 4.669ms 1
4017
+ FlashAttnFunc 1.78% 92.961us 34.69% 1.809ms 603.079us 0.000us 0.00% 4.669ms 1.556ms 3
4018
+ _flash_attn3_48fe103_dirty::fwd 0.98% 50.990us 32.91% 1.716ms 572.092us 3.496ms 100.00% 4.669ms 1.556ms 3
4019
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.498ms 100.05% 3.498ms 3.498ms 1
4020
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.496ms 100.00% 3.496ms 1.165ms 3
4021
+ Activity Buffer Request 27.66% 1.443ms 27.66% 1.443ms 1.443ms 1.173ms 33.56% 1.173ms 1.173ms 1
4022
+ aten::empty 0.56% 28.951us 0.56% 28.951us 4.825us 0.000us 0.00% 0.000us 0.000us 6
4023
+ cudaFuncSetAttribute 0.09% 4.870us 0.09% 4.870us 1.623us 0.000us 0.00% 0.000us 0.000us 3
4024
+ cudaLaunchKernel 3.62% 188.673us 3.62% 188.673us 62.891us 0.000us 0.00% 0.000us 0.000us 3
4025
+ cudaDeviceSynchronize 62.86% 3.279ms 62.86% 3.279ms 3.279ms 0.000us 0.00% 0.000us 0.000us 1
4026
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
+ Self CPU time total: 5.216ms
4028
+ Self CUDA time total: 3.496ms
4029
 
4030
 
4031
 
 
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
+ hf_kernels_flash_attn3 2.26% 115.651us 36.11% 1.844ms 1.844ms 0.000us 0.00% 4.648ms 4.648ms 1
4039
+ FlashAttnFunc 1.78% 91.130us 33.84% 1.728ms 576.085us 0.000us 0.00% 4.648ms 1.549ms 3
4040
+ _flash_attn3_48fe103_dirty::fwd 1.06% 54.250us 32.06% 1.637ms 545.708us 3.480ms 100.00% 4.648ms 1.549ms 3
4041
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.481ms 100.04% 3.481ms 3.481ms 1
4042
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.480ms 100.00% 3.480ms 1.160ms 3
4043
+ Activity Buffer Request 27.00% 1.379ms 27.00% 1.379ms 1.379ms 1.168ms 33.58% 1.168ms 1.168ms 1
4044
+ aten::empty 0.55% 28.142us 0.55% 28.142us 4.690us 0.000us 0.00% 0.000us 0.000us 6
4045
+ cudaFuncSetAttribute 0.10% 5.261us 0.10% 5.261us 1.754us 0.000us 0.00% 0.000us 0.000us 3
4046
+ cudaLaunchKernel 3.35% 170.883us 3.35% 170.883us 56.961us 0.000us 0.00% 0.000us 0.000us 3
4047
+ cudaDeviceSynchronize 63.89% 3.263ms 63.89% 3.263ms 3.263ms 0.000us 0.00% 0.000us 0.000us 1
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
+ Self CPU time total: 5.107ms
4050
+ Self CUDA time total: 3.480ms
4051
 
4052
 
4053
  impl wl p50(ms) ok
4054
+ hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.95 True
4055
+ hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
4056
+ hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True
4057
+ hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.04 True
4058
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.21 True
4059
+ hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4060
  </pre></div>
4061
+ <div class="cell-stderr">
4062
+ Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4063
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.33it/s]
4064
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.66it/s]
4065
  </div>
4066
  <div class="cell-artifacts">
4067
  <h4>Artifacts:</h4>
4068
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/mem_efficient_attention.html CHANGED
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
3861
  </div>
3862
  </div>
3863
 
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 44.03s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3924,28 +3924,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
3924
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3925
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3926
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3927
- torch_mem_eff 2.05% 363.238us 13.65% 2.421ms 2.421ms 0.000us 0.00% 16.223ms 16.223ms 1
3928
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 16.048ms 100.05% 16.048ms 16.048ms 1
3929
- aten::scaled_dot_product_attention 0.20% 35.830us 1.03% 182.144us 60.715us 0.000us 0.00% 14.265ms 4.755ms 3
3930
- aten::_scaled_dot_product_efficient_attention 0.13% 22.700us 0.82% 146.314us 48.771us 0.000us 0.00% 14.265ms 4.755ms 3
3931
- aten::_efficient_attention_forward 0.19% 33.351us 0.54% 96.203us 32.068us 14.265ms 88.94% 14.265ms 4.755ms 3
3932
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 14.265ms 88.94% 14.265ms 4.755ms 3
3933
- aten::contiguous 0.08% 13.451us 10.18% 1.806ms 200.629us 0.000us 0.00% 1.957ms 217.467us 9
3934
- aten::clone 0.17% 30.701us 10.10% 1.792ms 199.134us 0.000us 0.00% 1.957ms 217.467us 9
3935
- aten::copy_ 0.43% 76.213us 9.49% 1.684ms 187.121us 1.775ms 11.06% 1.957ms 217.467us 9
3936
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.775ms 11.06% 1.775ms 197.189us 9
3937
- Activity Buffer Request 8.62% 1.529ms 8.62% 1.529ms 1.529ms 182.494us 1.14% 182.494us 182.494us 1
3938
- aten::transpose 0.41% 73.552us 0.55% 97.771us 4.074us 0.000us 0.00% 0.000us 0.000us 24
3939
- aten::as_strided 0.14% 24.219us 0.14% 24.219us 1.009us 0.000us 0.00% 0.000us 0.000us 24
3940
- aten::empty_like 0.13% 23.478us 0.44% 77.421us 8.602us 0.000us 0.00% 0.000us 0.000us 9
3941
- aten::empty 0.48% 85.684us 0.48% 85.684us 4.080us 0.000us 0.00% 0.000us 0.000us 21
3942
- cudaLaunchKernel 0.58% 102.581us 0.58% 102.581us 8.548us 0.000us 0.00% 0.000us 0.000us 12
3943
- cudaStreamIsCapturing 0.02% 3.010us 0.02% 3.010us 1.003us 0.000us 0.00% 0.000us 0.000us 3
3944
- cudaFuncSetAttribute 0.02% 4.301us 0.02% 4.301us 1.434us 0.000us 0.00% 0.000us 0.000us 3
3945
- cudaDeviceSynchronize 86.35% 15.322ms 86.35% 15.322ms 15.322ms 0.000us 0.00% 0.000us 0.000us 1
3946
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3947
- Self CPU time total: 17.744ms
3948
- Self CUDA time total: 16.040ms
3949
 
3950
 
3951
 
@@ -3955,28 +3955,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3957
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3958
- torch_mem_eff 1.10% 253.536us 9.32% 2.141ms 2.141ms 0.000us 0.00% 21.587ms 21.587ms 1
3959
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 21.402ms 100.04% 21.402ms 21.402ms 1
3960
- aten::scaled_dot_product_attention 0.08% 19.430us 0.63% 143.683us 47.894us 0.000us 0.00% 19.557ms 6.519ms 3
3961
- aten::_scaled_dot_product_efficient_attention 0.08% 18.332us 0.54% 124.253us 41.418us 0.000us 0.00% 19.557ms 6.519ms 3
3962
- aten::_efficient_attention_forward 0.12% 28.280us 0.35% 81.271us 27.090us 19.557ms 91.42% 19.557ms 6.519ms 3
3963
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 19.557ms 91.42% 19.557ms 6.519ms 3
3964
- aten::contiguous 0.03% 7.109us 7.41% 1.701ms 189.023us 0.000us 0.00% 2.030ms 225.605us 9
3965
- aten::clone 0.09% 20.673us 7.38% 1.694ms 188.233us 0.000us 0.00% 2.030ms 225.605us 9
3966
- aten::copy_ 0.27% 61.032us 7.08% 1.625ms 180.543us 1.836ms 8.58% 2.030ms 225.605us 9
3967
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.836ms 8.58% 1.836ms 203.973us 9
3968
- Activity Buffer Request 6.54% 1.501ms 6.54% 1.501ms 1.501ms 194.686us 0.91% 194.686us 194.686us 1
3969
- aten::transpose 0.22% 49.892us 0.29% 67.250us 2.802us 0.000us 0.00% 0.000us 0.000us 24
3970
- aten::as_strided 0.08% 17.358us 0.08% 17.358us 0.723us 0.000us 0.00% 0.000us 0.000us 24
3971
- aten::empty_like 0.05% 11.620us 0.21% 48.540us 5.393us 0.000us 0.00% 0.000us 0.000us 9
3972
- aten::empty 0.27% 63.131us 0.27% 63.131us 3.006us 0.000us 0.00% 0.000us 0.000us 21
3973
- cudaLaunchKernel 0.37% 84.411us 0.37% 84.411us 7.034us 0.000us 0.00% 0.000us 0.000us 12
3974
- cudaStreamIsCapturing 0.01% 2.460us 0.01% 2.460us 0.820us 0.000us 0.00% 0.000us 0.000us 3
3975
- cudaFuncSetAttribute 0.01% 2.960us 0.01% 2.960us 0.987us 0.000us 0.00% 0.000us 0.000us 3
3976
- cudaDeviceSynchronize 90.68% 20.821ms 90.68% 20.821ms 20.821ms 0.000us 0.00% 0.000us 0.000us 1
3977
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3978
- Self CPU time total: 22.962ms
3979
- Self CUDA time total: 21.392ms
3980
 
3981
 
3982
 
@@ -3986,28 +3986,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
- torch_mem_eff 1.02% 243.020us 8.92% 2.127ms 2.127ms 0.000us 0.00% 22.482ms 22.482ms 1
3990
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 22.293ms 100.04% 22.293ms 22.293ms 1
3991
- aten::scaled_dot_product_attention 0.08% 18.442us 0.60% 142.065us 47.355us 0.000us 0.00% 20.413ms 6.804ms 3
3992
- aten::_scaled_dot_product_efficient_attention 0.08% 17.984us 0.52% 123.623us 41.208us 0.000us 0.00% 20.413ms 6.804ms 3
3993
- aten::_efficient_attention_forward 0.12% 28.538us 0.35% 82.550us 27.517us 20.413ms 91.61% 20.413ms 6.804ms 3
3994
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 20.413ms 91.61% 20.413ms 6.804ms 3
3995
- aten::contiguous 0.03% 7.301us 7.12% 1.699ms 188.733us 0.000us 0.00% 2.068ms 229.822us 9
3996
- aten::clone 0.09% 20.431us 7.09% 1.691ms 187.922us 0.000us 0.00% 2.068ms 229.822us 9
3997
- aten::copy_ 0.25% 59.709us 6.80% 1.622ms 180.233us 1.870ms 8.39% 2.068ms 229.822us 9
3998
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.870ms 8.39% 1.870ms 207.771us 9
3999
- Activity Buffer Request 6.28% 1.498ms 6.28% 1.498ms 1.498ms 198.462us 0.89% 198.462us 198.462us 1
4000
- aten::transpose 0.21% 49.091us 0.28% 66.291us 2.762us 0.000us 0.00% 0.000us 0.000us 24
4001
- aten::as_strided 0.07% 17.200us 0.07% 17.200us 0.717us 0.000us 0.00% 0.000us 0.000us 24
4002
- aten::empty_like 0.05% 11.563us 0.20% 48.772us 5.419us 0.000us 0.00% 0.000us 0.000us 9
4003
- aten::empty 0.27% 63.659us 0.27% 63.659us 3.031us 0.000us 0.00% 0.000us 0.000us 21
4004
- cudaLaunchKernel 0.36% 86.324us 0.36% 86.324us 7.194us 0.000us 0.00% 0.000us 0.000us 12
4005
- cudaStreamIsCapturing 0.01% 2.431us 0.01% 2.431us 0.810us 0.000us 0.00% 0.000us 0.000us 3
4006
- cudaFuncSetAttribute 0.01% 2.970us 0.01% 2.970us 0.990us 0.000us 0.00% 0.000us 0.000us 3
4007
- cudaDeviceSynchronize 91.08% 21.725ms 91.08% 21.725ms 21.725ms 0.000us 0.00% 0.000us 0.000us 1
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
- Self CPU time total: 23.852ms
4010
- Self CUDA time total: 22.283ms
4011
 
4012
 
4013
 
@@ -4017,28 +4017,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
- torch_mem_eff 1.02% 244.258us 9.92% 2.384ms 2.384ms 0.000us 0.00% 22.468ms 22.468ms 1
4021
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 22.273ms 100.04% 22.273ms 22.273ms 1
4022
- aten::scaled_dot_product_attention 0.08% 18.581us 0.64% 152.823us 50.941us 0.000us 0.00% 20.365ms 6.788ms 3
4023
- aten::_scaled_dot_product_efficient_attention 0.08% 18.340us 0.56% 134.242us 44.747us 0.000us 0.00% 20.365ms 6.788ms 3
4024
- aten::_efficient_attention_forward 0.12% 27.659us 0.39% 92.632us 30.877us 20.365ms 91.47% 20.365ms 6.788ms 3
4025
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 20.365ms 91.47% 20.365ms 6.788ms 3
4026
- aten::contiguous 0.03% 7.371us 8.08% 1.943ms 215.938us 0.000us 0.00% 2.103ms 233.655us 9
4027
- aten::clone 0.09% 21.799us 8.05% 1.936ms 215.119us 0.000us 0.00% 2.103ms 233.655us 9
4028
- aten::copy_ 0.27% 65.442us 7.66% 1.841ms 204.604us 1.898ms 8.53% 2.103ms 233.655us 9
4029
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.898ms 8.53% 1.898ms 210.921us 9
4030
- Activity Buffer Request 6.22% 1.495ms 6.22% 1.495ms 1.495ms 204.607us 0.92% 204.607us 204.607us 1
4031
- aten::transpose 0.20% 48.657us 0.28% 66.799us 2.783us 0.000us 0.00% 0.000us 0.000us 24
4032
- aten::as_strided 0.08% 18.142us 0.08% 18.142us 0.756us 0.000us 0.00% 0.000us 0.000us 24
4033
- aten::empty_like 0.13% 32.371us 0.30% 72.832us 8.092us 0.000us 0.00% 0.000us 0.000us 9
4034
- aten::empty 0.29% 69.063us 0.29% 69.063us 3.289us 0.000us 0.00% 0.000us 0.000us 21
4035
- cudaLaunchKernel 1.30% 311.775us 1.30% 311.775us 25.981us 0.000us 0.00% 0.000us 0.000us 12
4036
- cudaStreamIsCapturing 0.01% 2.430us 0.01% 2.430us 0.810us 0.000us 0.00% 0.000us 0.000us 3
4037
- cudaFuncSetAttribute 0.01% 2.951us 0.01% 2.951us 0.984us 0.000us 0.00% 0.000us 0.000us 3
4038
- cudaDeviceSynchronize 90.08% 21.659ms 90.08% 21.659ms 21.659ms 0.000us 0.00% 0.000us 0.000us 1
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
- Self CPU time total: 24.043ms
4041
- Self CUDA time total: 22.264ms
4042
 
4043
 
4044
 
@@ -4048,28 +4048,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4050
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4051
- torch_mem_eff 0.99% 238.965us 8.38% 2.024ms 2.024ms 0.000us 0.00% 22.887ms 22.887ms 1
4052
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 22.691ms 100.04% 22.691ms 22.691ms 1
4053
- aten::scaled_dot_product_attention 0.08% 19.540us 0.60% 145.283us 48.428us 0.000us 0.00% 20.756ms 6.919ms 3
4054
- aten::_scaled_dot_product_efficient_attention 0.08% 18.450us 0.52% 125.743us 41.914us 0.000us 0.00% 20.756ms 6.919ms 3
4055
- aten::_efficient_attention_forward 0.12% 28.200us 0.34% 82.042us 27.347us 20.756ms 91.51% 20.756ms 6.919ms 3
4056
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 20.756ms 91.51% 20.756ms 6.919ms 3
4057
- aten::contiguous 0.03% 7.310us 6.62% 1.597ms 177.483us 0.000us 0.00% 2.130ms 236.720us 9
4058
- aten::clone 0.08% 20.502us 6.59% 1.590ms 176.671us 0.000us 0.00% 2.130ms 236.720us 9
4059
- aten::copy_ 0.25% 60.710us 6.29% 1.519ms 168.815us 1.926ms 8.49% 2.130ms 236.720us 9
4060
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.926ms 8.49% 1.926ms 213.965us 9
4061
- Activity Buffer Request 4.97% 1.199ms 4.97% 1.199ms 1.199ms 204.798us 0.90% 204.798us 204.798us 1
4062
- aten::transpose 0.21% 49.950us 0.28% 67.671us 2.820us 0.000us 0.00% 0.000us 0.000us 24
4063
- aten::as_strided 0.07% 17.721us 0.07% 17.721us 0.738us 0.000us 0.00% 0.000us 0.000us 24
4064
- aten::empty_like 0.05% 11.321us 0.21% 50.202us 5.578us 0.000us 0.00% 0.000us 0.000us 9
4065
- aten::empty 0.27% 64.383us 0.27% 64.383us 3.066us 0.000us 0.00% 0.000us 0.000us 21
4066
- cudaLaunchKernel 1.17% 282.217us 1.17% 282.217us 23.518us 0.000us 0.00% 0.000us 0.000us 12
4067
- cudaStreamIsCapturing 0.01% 2.720us 0.01% 2.720us 0.907us 0.000us 0.00% 0.000us 0.000us 3
4068
- cudaFuncSetAttribute 0.01% 3.029us 0.01% 3.029us 1.010us 0.000us 0.00% 0.000us 0.000us 3
4069
- cudaDeviceSynchronize 91.62% 22.117ms 91.62% 22.117ms 22.117ms 0.000us 0.00% 0.000us 0.000us 1
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
- Self CPU time total: 24.141ms
4072
- Self CUDA time total: 22.682ms
4073
 
4074
 
4075
 
@@ -4079,90 +4079,38 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4081
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4082
- torch_mem_eff 0.89% 241.438us 9.64% 2.630ms 2.630ms 0.000us 0.00% 25.454ms 25.454ms 1
4083
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 25.223ms 100.04% 25.223ms 25.223ms 1
4084
- aten::scaled_dot_product_attention 0.07% 18.690us 0.53% 143.613us 47.871us 0.000us 0.00% 22.917ms 7.639ms 3
4085
- aten::_scaled_dot_product_efficient_attention 0.07% 19.432us 0.46% 124.923us 41.641us 0.000us 0.00% 22.917ms 7.639ms 3
4086
- aten::_efficient_attention_forward 0.10% 27.951us 0.30% 81.832us 27.277us 22.917ms 90.90% 22.917ms 7.639ms 3
4087
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 22.917ms 90.90% 22.917ms 7.639ms 3
4088
- aten::contiguous 0.03% 7.769us 8.07% 2.200ms 244.390us 0.000us 0.00% 2.537ms 281.850us 9
4089
- aten::clone 0.08% 21.360us 8.04% 2.192ms 243.526us 0.000us 0.00% 2.537ms 281.850us 9
4090
- aten::copy_ 0.23% 62.351us 7.77% 2.118ms 235.368us 2.295ms 9.10% 2.537ms 281.850us 9
4091
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.295ms 9.10% 2.295ms 255.042us 9
4092
- Activity Buffer Request 5.96% 1.625ms 5.96% 1.625ms 1.625ms 241.278us 0.96% 241.278us 241.278us 1
4093
- aten::transpose 0.19% 51.326us 0.25% 68.688us 2.862us 0.000us 0.00% 0.000us 0.000us 24
4094
- aten::as_strided 0.06% 17.362us 0.06% 17.362us 0.723us 0.000us 0.00% 0.000us 0.000us 24
4095
- aten::empty_like 0.04% 11.861us 0.19% 52.062us 5.785us 0.000us 0.00% 0.000us 0.000us 9
4096
- aten::empty 0.24% 65.461us 0.24% 65.461us 3.117us 0.000us 0.00% 0.000us 0.000us 21
4097
- cudaLaunchKernel 1.67% 454.311us 1.67% 454.311us 37.859us 0.000us 0.00% 0.000us 0.000us 12
4098
- cudaStreamIsCapturing 0.01% 2.710us 0.01% 2.710us 0.903us 0.000us 0.00% 0.000us 0.000us 3
4099
- cudaFuncSetAttribute 0.01% 2.880us 0.01% 2.880us 0.960us 0.000us 0.00% 0.000us 0.000us 3
4100
- cudaDeviceSynchronize 90.36% 24.642ms 90.36% 24.642ms 24.642ms 0.000us 0.00% 0.000us 0.000us 1
4101
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4102
- Self CPU time total: 27.271ms
4103
- Self CUDA time total: 25.213ms
4104
 
4105
 
4106
  impl wl p50(ms) ok
4107
- torch_mem_eff cuda_attn_L128_bfloat16 6.77 True
4108
- torch_mem_eff cuda_attn_L256_bfloat16 7.24 True
4109
- torch_mem_eff cuda_attn_L320_bfloat16 7.52 True
4110
- torch_mem_eff cuda_attn_L384_bfloat16 7.59 True
4111
- torch_mem_eff cuda_attn_L448_bfloat16 7.97 True
4112
- torch_mem_eff cuda_attn_L512_bfloat16 8.47 True
4113
  </pre></div>
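
Note: the removed p50 medians above (6.77-8.47 ms) and the replacement
medians later in this diff (1.89-2.27 ms) come from different hosts: the
"Generated on" line changes in this same commit, and the sibling files'
cell timers drop from ~44-45 s to ~4-5 s, so this reads as a re-run on a
new environment rather than a change to the attention code. As a hedged
sketch (not the harness's actual implementation), a p50(ms) column like
this is typically the median of per-iteration CUDA-event timings; the
rep and warmup counts here are illustrative:

    import statistics
    import torch

    def p50_ms(fn, reps=5, warmup=2):
        # Warm up so one-time compilation/caching does not skew the median.
        for _ in range(warmup):
            fn()
        times = []
        for _ in range(reps):
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            fn()
            end.record()
            torch.cuda.synchronize()
            times.append(start.elapsed_time(end))  # milliseconds
        return statistics.median(times)
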
4114
- <div class="uv-install-logs" id="uv-logs-benchmark">
4115
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4116
- <div class="uv-logs-content" style="display: none;">
4117
- Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
4118
- Downloading pillow (6.7MiB)
4119
- Downloading sympy (6.0MiB)
4120
- Downloading nvidia-nccl-cu12 (307.4MiB)
4121
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4122
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4123
- Downloading nvidia-cufile-cu12 (1.1MiB)
4124
- Downloading fonttools (4.7MiB)
4125
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4126
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4127
- Downloading numpy (15.9MiB)
4128
- Downloading setuptools (1.1MiB)
4129
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4130
- Downloading matplotlib (8.3MiB)
4131
- Downloading networkx (1.9MiB)
4132
- Downloading nvidia-curand-cu12 (60.7MiB)
4133
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4134
- Downloading kiwisolver (1.4MiB)
4135
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4136
- Downloading nvidia-cufft-cu12 (184.2MiB)
4137
- Downloading nvidia-cublas-cu12 (566.8MiB)
4138
- Downloading torch (846.8MiB)
4139
- Downloading triton (148.4MiB)
4140
- Downloading nvidia-cufile-cu12
4141
- Downloading kiwisolver
4142
- Downloading setuptools
4143
- Downloading fonttools
4144
- Downloading networkx
4145
- Downloading pillow
4146
- Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
4147
- Downloading matplotlib
4148
- Downloading nvidia-cuda-cupti-cu12
4149
- Downloading numpy
4150
- Downloading nvidia-nvjitlink-cu12
4151
- Downloading sympy
4152
- Downloading nvidia-curand-cu12
4153
- Downloading nvidia-cuda-nvrtc-cu12
4154
- Downloading triton
4155
- Downloading nvidia-cufft-cu12
4156
- Downloading nvidia-cusolver-cu12
4157
- Downloading nvidia-cusparse-cu12
4158
- Downloading nvidia-cusparselt-cu12
4159
- Downloading nvidia-nccl-cu12
4160
- Downloading nvidia-cublas-cu12
4161
- Downloading nvidia-cudnn-cu12
4162
- Downloading torch
4163
- Installed 37 packages in 228ms
4164
- </div>
4165
- </div>
4166
  <div class="cell-artifacts">
4167
  <h4>Artifacts:</h4>
4168
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
 
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 4.02s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3924
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3925
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3926
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3927
+ torch_mem_eff 4.61% 329.029us 32.49% 2.320ms 2.320ms 0.000us 0.00% 5.545ms 5.545ms 1
3928
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.524ms 100.54% 5.524ms 5.524ms 1
3929
+ aten::scaled_dot_product_attention 0.42% 29.860us 2.75% 196.242us 65.414us 0.000us 0.00% 4.878ms 1.626ms 3
3930
+ aten::_scaled_dot_product_efficient_attention 0.35% 25.230us 2.33% 166.382us 55.461us 0.000us 0.00% 4.878ms 1.626ms 3
3931
+ aten::_efficient_attention_forward 0.73% 52.049us 1.68% 119.861us 39.954us 4.878ms 88.79% 4.878ms 1.626ms 3
3932
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.878ms 88.79% 4.878ms 1.626ms 3
3933
+ aten::contiguous 0.18% 13.143us 24.28% 1.734ms 192.643us 0.000us 0.00% 666.300us 74.033us 9
3934
+ aten::clone 0.50% 35.608us 24.09% 1.721ms 191.183us 0.000us 0.00% 666.300us 74.033us 9
3935
+ aten::copy_ 1.01% 71.952us 22.59% 1.613ms 179.214us 615.708us 11.21% 666.300us 74.033us 9
3936
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 615.708us 11.21% 615.708us 68.412us 9
3937
+ Activity Buffer Request 20.33% 1.452ms 20.33% 1.452ms 1.452ms 50.592us 0.92% 50.592us 50.592us 1
3938
+ aten::transpose 0.87% 61.994us 1.16% 82.494us 3.437us 0.000us 0.00% 0.000us 0.000us 24
3939
+ aten::as_strided 0.29% 20.500us 0.29% 20.500us 0.854us 0.000us 0.00% 0.000us 0.000us 24
3940
+ aten::empty_like 0.25% 17.742us 1.01% 72.112us 8.012us 0.000us 0.00% 0.000us 0.000us 9
3941
+ aten::empty 1.17% 83.610us 1.17% 83.610us 3.981us 0.000us 0.00% 0.000us 0.000us 21
3942
+ cudaLaunchKernel 1.60% 114.582us 1.60% 114.582us 9.548us 0.000us 0.00% 0.000us 0.000us 12
3943
+ cudaStreamIsCapturing 0.04% 3.180us 0.04% 3.180us 1.060us 0.000us 0.00% 0.000us 0.000us 3
3944
+ cudaFuncSetAttribute 0.14% 10.280us 0.14% 10.280us 3.427us 0.000us 0.00% 0.000us 0.000us 3
3945
+ cudaDeviceSynchronize 67.51% 4.821ms 67.51% 4.821ms 4.821ms 0.000us 0.00% 0.000us 0.000us 1
3946
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3947
+ Self CPU time total: 7.141ms
3948
+ Self CUDA time total: 5.494ms
3949
 
3950
 
3951
 
 
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3957
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3958
+ torch_mem_eff 3.39% 253.102us 28.13% 2.097ms 2.097ms 0.000us 0.00% 5.972ms 5.972ms 1
3959
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.926ms 100.15% 5.926ms 5.926ms 1
3960
+ aten::scaled_dot_product_attention 0.26% 19.190us 1.92% 143.113us 47.704us 0.000us 0.00% 5.278ms 1.759ms 3
3961
+ aten::_scaled_dot_product_efficient_attention 0.26% 19.540us 1.66% 123.923us 41.308us 0.000us 0.00% 5.278ms 1.759ms 3
3962
+ aten::_efficient_attention_forward 0.37% 27.385us 1.10% 81.652us 27.217us 5.278ms 89.20% 5.278ms 1.759ms 3
3963
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.278ms 89.20% 5.278ms 1.759ms 3
3964
+ aten::contiguous 0.09% 6.999us 22.26% 1.660ms 184.423us 0.000us 0.00% 693.503us 77.056us 9
3965
+ aten::clone 0.31% 23.031us 22.17% 1.653ms 183.645us 0.000us 0.00% 693.503us 77.056us 9
3966
+ aten::copy_ 0.83% 61.989us 21.18% 1.579ms 175.477us 638.911us 10.80% 693.503us 77.056us 9
3967
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 638.911us 10.80% 638.911us 70.990us 9
3968
+ Activity Buffer Request 19.45% 1.450ms 19.45% 1.450ms 1.450ms 54.592us 0.92% 54.592us 54.592us 1
3969
+ aten::transpose 0.64% 47.641us 0.86% 64.101us 2.671us 0.000us 0.00% 0.000us 0.000us 24
3970
+ aten::as_strided 0.22% 16.460us 0.22% 16.460us 0.686us 0.000us 0.00% 0.000us 0.000us 24
3971
+ aten::empty_like 0.16% 11.730us 0.68% 50.483us 5.609us 0.000us 0.00% 0.000us 0.000us 9
3972
+ aten::empty 0.86% 64.470us 0.86% 64.470us 3.070us 0.000us 0.00% 0.000us 0.000us 21
3973
+ cudaLaunchKernel 1.21% 90.240us 1.21% 90.240us 7.520us 0.000us 0.00% 0.000us 0.000us 12
3974
+ cudaStreamIsCapturing 0.03% 2.290us 0.03% 2.290us 0.763us 0.000us 0.00% 0.000us 0.000us 3
3975
+ cudaFuncSetAttribute 0.04% 3.130us 0.04% 3.130us 1.043us 0.000us 0.00% 0.000us 0.000us 3
3976
+ cudaDeviceSynchronize 71.87% 5.359ms 71.87% 5.359ms 5.359ms 0.000us 0.00% 0.000us 0.000us 1
3977
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3978
+ Self CPU time total: 7.456ms
3979
+ Self CUDA time total: 5.917ms
3980
 
3981
 
3982
 
 
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
+ torch_mem_eff 3.16% 240.823us 26.89% 2.051ms 2.051ms 0.000us 0.00% 6.167ms 6.167ms 1
3990
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.117ms 100.14% 6.117ms 6.117ms 1
3991
+ aten::scaled_dot_product_attention 0.24% 18.220us 1.81% 137.732us 45.911us 0.000us 0.00% 5.453ms 1.818ms 3
3992
+ aten::_scaled_dot_product_efficient_attention 0.24% 18.402us 1.57% 119.512us 39.837us 0.000us 0.00% 5.453ms 1.818ms 3
3993
+ aten::_efficient_attention_forward 0.35% 26.389us 1.04% 79.670us 26.557us 5.453ms 89.28% 5.453ms 1.818ms 3
3994
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.453ms 89.28% 5.453ms 1.818ms 3
3995
+ aten::contiguous 0.09% 6.950us 21.38% 1.630ms 181.132us 0.000us 0.00% 713.534us 79.282us 9
3996
+ aten::clone 0.28% 21.189us 21.28% 1.623ms 180.360us 0.000us 0.00% 713.534us 79.282us 9
3997
+ aten::copy_ 0.81% 62.032us 20.34% 1.551ms 172.330us 655.038us 10.72% 713.534us 79.282us 9
3998
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 655.038us 10.72% 655.038us 72.782us 9
3999
+ Activity Buffer Request 18.63% 1.421ms 18.63% 1.421ms 1.421ms 58.496us 0.96% 58.496us 58.496us 1
4000
+ aten::transpose 0.62% 47.348us 0.84% 63.699us 2.654us 0.000us 0.00% 0.000us 0.000us 24
4001
+ aten::as_strided 0.21% 16.351us 0.21% 16.351us 0.681us 0.000us 0.00% 0.000us 0.000us 24
4002
+ aten::empty_like 0.15% 11.091us 0.67% 51.081us 5.676us 0.000us 0.00% 0.000us 0.000us 9
4003
+ aten::empty 0.86% 65.760us 0.86% 65.760us 3.131us 0.000us 0.00% 0.000us 0.000us 21
4004
+ cudaLaunchKernel 1.18% 89.982us 1.18% 89.982us 7.498us 0.000us 0.00% 0.000us 0.000us 12
4005
+ cudaStreamIsCapturing 0.03% 2.210us 0.03% 2.210us 0.737us 0.000us 0.00% 0.000us 0.000us 3
4006
+ cudaFuncSetAttribute 0.04% 3.100us 0.04% 3.100us 1.033us 0.000us 0.00% 0.000us 0.000us 3
4007
+ cudaDeviceSynchronize 73.11% 5.575ms 73.11% 5.575ms 5.575ms 0.000us 0.00% 0.000us 0.000us 1
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
+ Self CPU time total: 7.626ms
4010
+ Self CUDA time total: 6.108ms
4011
 
4012
 
4013
 
 
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
+ torch_mem_eff 4.44% 356.182us 33.00% 2.648ms 2.648ms 0.000us 0.00% 6.210ms 6.210ms 1
4021
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.165ms 100.21% 6.165ms 6.165ms 1
4022
+ aten::scaled_dot_product_attention 0.29% 23.400us 2.31% 185.263us 61.754us 0.000us 0.00% 5.497ms 1.832ms 3
4023
+ aten::_scaled_dot_product_efficient_attention 0.29% 23.202us 2.02% 161.863us 53.954us 0.000us 0.00% 5.497ms 1.832ms 3
4024
+ aten::_efficient_attention_forward 0.44% 35.239us 1.36% 108.811us 36.270us 5.497ms 89.36% 5.497ms 1.832ms 3
4025
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.497ms 89.36% 5.497ms 1.832ms 3
4026
+ aten::contiguous 0.11% 9.040us 25.54% 2.050ms 227.726us 0.000us 0.00% 712.735us 79.193us 9
4027
+ aten::clone 0.35% 28.461us 25.43% 2.040ms 226.722us 0.000us 0.00% 712.735us 79.193us 9
4028
+ aten::copy_ 1.02% 82.020us 24.22% 1.944ms 215.993us 654.527us 10.64% 712.735us 79.193us 9
4029
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.527us 10.64% 654.527us 72.725us 9
4030
+ Activity Buffer Request 19.35% 1.553ms 19.35% 1.553ms 1.553ms 58.208us 0.95% 58.208us 58.208us 1
4031
+ aten::transpose 0.81% 64.960us 1.09% 87.330us 3.639us 0.000us 0.00% 0.000us 0.000us 24
4032
+ aten::as_strided 0.28% 22.370us 0.28% 22.370us 0.932us 0.000us 0.00% 0.000us 0.000us 24
4033
+ aten::empty_like 0.19% 15.081us 0.85% 68.092us 7.566us 0.000us 0.00% 0.000us 0.000us 9
4034
+ aten::empty 1.09% 87.522us 1.09% 87.522us 4.168us 0.000us 0.00% 0.000us 0.000us 21
4035
+ cudaLaunchKernel 4.25% 341.154us 4.25% 341.154us 28.429us 0.000us 0.00% 0.000us 0.000us 12
4036
+ cudaStreamIsCapturing 0.04% 2.841us 0.04% 2.841us 0.947us 0.000us 0.00% 0.000us 0.000us 3
4037
+ cudaFuncSetAttribute 0.05% 4.120us 0.05% 4.120us 1.373us 0.000us 0.00% 0.000us 0.000us 3
4038
+ cudaDeviceSynchronize 67.00% 5.376ms 67.00% 5.376ms 5.376ms 0.000us 0.00% 0.000us 0.000us 1
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
+ Self CPU time total: 8.025ms
4041
+ Self CUDA time total: 6.152ms
4042
 
4043
 
4044
 
 
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4050
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4051
+ torch_mem_eff 3.33% 272.217us 28.45% 2.323ms 2.323ms 0.000us 0.00% 6.452ms 6.452ms 1
4052
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.401ms 100.14% 6.401ms 6.401ms 1
4053
+ aten::scaled_dot_product_attention 0.25% 20.040us 1.74% 141.700us 47.233us 0.000us 0.00% 5.729ms 1.910ms 3
4054
+ aten::_scaled_dot_product_efficient_attention 0.23% 18.560us 1.49% 121.660us 40.553us 0.000us 0.00% 5.729ms 1.910ms 3
4055
+ aten::_efficient_attention_forward 0.34% 27.420us 1.00% 81.440us 27.147us 5.729ms 89.62% 5.729ms 1.910ms 3
4056
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.729ms 89.62% 5.729ms 1.910ms 3
4057
+ aten::contiguous 0.09% 7.310us 22.83% 1.865ms 207.177us 0.000us 0.00% 723.614us 80.402us 9
4058
+ aten::clone 0.27% 22.438us 22.75% 1.857ms 206.364us 0.000us 0.00% 723.614us 80.402us 9
4059
+ aten::copy_ 0.75% 61.292us 21.84% 1.783ms 198.108us 663.806us 10.38% 723.614us 80.402us 9
4060
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 663.806us 10.38% 663.806us 73.756us 9
4061
+ Activity Buffer Request 18.13% 1.481ms 18.13% 1.481ms 1.481ms 59.808us 0.94% 59.808us 59.808us 1
4062
+ aten::transpose 0.61% 49.591us 0.81% 66.019us 2.751us 0.000us 0.00% 0.000us 0.000us 24
4063
+ aten::as_strided 0.20% 16.428us 0.20% 16.428us 0.684us 0.000us 0.00% 0.000us 0.000us 24
4064
+ aten::empty_like 0.14% 11.501us 0.64% 51.871us 5.763us 0.000us 0.00% 0.000us 0.000us 9
4065
+ aten::empty 0.80% 65.620us 0.80% 65.620us 3.125us 0.000us 0.00% 0.000us 0.000us 21
4066
+ cudaLaunchKernel 3.24% 264.473us 3.24% 264.473us 22.039us 0.000us 0.00% 0.000us 0.000us 12
4067
+ cudaStreamIsCapturing 0.03% 2.310us 0.03% 2.310us 0.770us 0.000us 0.00% 0.000us 0.000us 3
4068
+ cudaFuncSetAttribute 0.04% 3.060us 0.04% 3.060us 1.020us 0.000us 0.00% 0.000us 0.000us 3
4069
+ cudaDeviceSynchronize 71.55% 5.843ms 71.55% 5.843ms 5.843ms 0.000us 0.00% 0.000us 0.000us 1
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
+ Self CPU time total: 8.166ms
4072
+ Self CUDA time total: 6.392ms
4073
 
4074
 
4075
 
 
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4081
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4082
+ torch_mem_eff 2.84% 238.921us 26.25% 2.206ms 2.206ms 0.000us 0.00% 6.803ms 6.803ms 1
4083
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.751ms 100.13% 6.751ms 6.751ms 1
4084
+ aten::scaled_dot_product_attention 0.23% 19.080us 1.67% 140.122us 46.707us 0.000us 0.00% 6.072ms 2.024ms 3
4085
+ aten::_scaled_dot_product_efficient_attention 0.22% 18.680us 1.44% 121.042us 40.347us 0.000us 0.00% 6.072ms 2.024ms 3
4086
+ aten::_efficient_attention_forward 0.32% 27.009us 0.95% 79.840us 26.613us 6.072ms 90.07% 6.072ms 2.024ms 3
4087
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 6.072ms 90.07% 6.072ms 2.024ms 3
4088
+ aten::contiguous 0.09% 7.439us 21.24% 1.785ms 198.324us 0.000us 0.00% 731.099us 81.233us 9
4089
+ aten::clone 0.26% 21.852us 21.15% 1.777ms 197.498us 0.000us 0.00% 731.099us 81.233us 9
4090
+ aten::copy_ 0.77% 64.769us 20.27% 1.703ms 189.239us 669.820us 9.93% 731.099us 81.233us 9
4091
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 669.820us 9.93% 669.820us 74.424us 9
4092
+ Activity Buffer Request 16.92% 1.422ms 16.92% 1.422ms 1.422ms 61.279us 0.91% 61.279us 61.279us 1
4093
+ aten::transpose 0.57% 48.271us 0.77% 64.334us 2.681us 0.000us 0.00% 0.000us 0.000us 24
4094
+ aten::as_strided 0.19% 16.063us 0.19% 16.063us 0.669us 0.000us 0.00% 0.000us 0.000us 24
4095
+ aten::empty_like 0.14% 11.440us 0.62% 52.480us 5.831us 0.000us 0.00% 0.000us 0.000us 9
4096
+ aten::empty 0.79% 66.661us 0.79% 66.661us 3.174us 0.000us 0.00% 0.000us 0.000us 21
4097
+ cudaLaunchKernel 2.84% 238.383us 2.84% 238.383us 19.865us 0.000us 0.00% 0.000us 0.000us 12
4098
+ cudaStreamIsCapturing 0.03% 2.270us 0.03% 2.270us 0.757us 0.000us 0.00% 0.000us 0.000us 3
4099
+ cudaFuncSetAttribute 0.04% 3.090us 0.04% 3.090us 1.030us 0.000us 0.00% 0.000us 0.000us 3
4100
+ cudaDeviceSynchronize 73.75% 6.196ms 73.75% 6.196ms 6.196ms 0.000us 0.00% 0.000us 0.000us 1
4101
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4102
+ Self CPU time total: 8.402ms
4103
+ Self CUDA time total: 6.742ms
4104
 
4105
 
4106
  impl wl p50(ms) ok
4107
+ torch_mem_eff cuda_attn_L128_bfloat16 1.89 True
4108
+ torch_mem_eff cuda_attn_L256_bfloat16 1.95 True
4109
+ torch_mem_eff cuda_attn_L320_bfloat16 2.05 True
4110
+ torch_mem_eff cuda_attn_L384_bfloat16 2.08 True
4111
+ torch_mem_eff cuda_attn_L448_bfloat16 2.13 True
4112
+ torch_mem_eff cuda_attn_L512_bfloat16 2.27 True
4113
  </pre></div>
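
Note: in every torch_mem_eff trace above, the op stack is
aten::scaled_dot_product_attention -> aten::_scaled_dot_product_efficient_attention
-> the CUTLASS kernel fmha_cutlassF_bf16_aligned_64x128_rf_sm80. A minimal
sketch of pinning SDPA to that backend (shapes are illustrative, not the
benchmark's; assumes a recent PyTorch with torch.nn.attention):

    import torch
    import torch.nn.functional as F
    from torch.nn.attention import SDPBackend, sdpa_kernel

    q = torch.randn(1, 8, 512, 64, device="cuda", dtype=torch.bfloat16)
    k, v = torch.randn_like(q), torch.randn_like(q)

    # Restrict dispatch to the memory-efficient backend so the profiler
    # records _scaled_dot_product_efficient_attention, as in the traces.
    with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
        out = F.scaled_dot_product_attention(q, k, v)
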
4114
  <div class="cell-artifacts">
4115
  <h4>Artifacts:</h4>
4116
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/sage_attention.html CHANGED
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
3861
  </div>
3862
  </div>
3863
 
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 44.02s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3921,76 +3921,28 @@ Cell: benchmark | 44.02s
3921
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
3922
  impl wl p50(ms) ok
3923
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
3924
- Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
3925
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
3926
- Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
3927
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
3928
- Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
3929
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
3930
- Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
3931
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
3932
- Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
3933
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
3934
- Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
3935
  </pre></div>
3936
  <div class="uv-install-logs" id="uv-logs-benchmark">
3937
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3938
  <div class="uv-logs-content" style="display: none;">
3939
- Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
3940
- Downloading pillow (6.7MiB)
3941
- Downloading hf-xet (3.2MiB)
3942
- Downloading networkx (1.9MiB)
3943
- Downloading setuptools (1.1MiB)
3944
- Downloading nvidia-cufile-cu12 (1.1MiB)
3945
- Downloading numpy (15.9MiB)
3946
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3947
- Downloading nvidia-cublas-cu12 (566.8MiB)
3948
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3949
- Downloading nvidia-curand-cu12 (60.7MiB)
3950
- Downloading sympy (6.0MiB)
3951
- Downloading kiwisolver (1.4MiB)
3952
- Downloading fonttools (4.7MiB)
3953
- Downloading nvidia-nccl-cu12 (307.4MiB)
3954
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3955
- Downloading nvidia-cufft-cu12 (184.2MiB)
3956
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3957
- Downloading matplotlib (8.3MiB)
3958
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3959
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3960
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3961
- Downloading triton (148.4MiB)
3962
- Downloading torch (846.8MiB)
3963
- Downloading nvidia-cufile-cu12
3964
- Downloading kiwisolver
3965
- Downloading hf-xet
3966
- Downloading setuptools
3967
- Downloading fonttools
3968
- Downloading networkx
3969
- Downloading pillow
3970
- Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
3971
- Downloading nvidia-cuda-cupti-cu12
3972
- Downloading matplotlib
3973
- Downloading numpy
3974
- Downloading sympy
3975
- Downloading nvidia-nvjitlink-cu12
3976
- Downloading nvidia-curand-cu12
3977
- Downloading nvidia-cuda-nvrtc-cu12
3978
- Downloading triton
3979
- Downloading nvidia-cufft-cu12
3980
- Downloading nvidia-cusolver-cu12
3981
- Downloading nvidia-cusparselt-cu12
3982
- Downloading nvidia-cusparse-cu12
3983
- Downloading nvidia-nccl-cu12
3984
- Downloading nvidia-cublas-cu12
3985
- Downloading nvidia-cudnn-cu12
3986
- Downloading torch
3987
- Installed 48 packages in 211ms
3988
  </div>
3989
  </div>
3990
  <div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3991
- Fetching 11 files: 9%|▉ | 1/11 [00:00&lt;00:02, 3.52it/s]
3992
- Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 9.29it/s]
3993
- Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 12.03it/s]</div>
3994
  <div class="cell-artifacts">
3995
  <h4>Artifacts:</h4>
3996
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
 
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 4.37s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3921
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
3922
  impl wl p50(ms) ok
3923
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
3924
+ Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
3925
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
3926
+ Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
3927
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
3928
+ Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
3929
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
3930
+ Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
3931
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
3932
+ Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
3933
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
3934
+ Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
3935
  </pre></div>
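
Note: every sage_int8_fp16 workload fails on the same attribute lookup —
the freshly fetched extension (hash 12c766386675beb4, replacing
c9b3a60c7e3c5091) does not export fwd, so this is a missing symbol, not a
kernel crash. A hedged sketch of how a harness can surface this more
usefully than an AttributeError deep in the call (resolve_fwd is a
hypothetical helper, not part of the benchmark code):

    def resolve_fwd(module):
        """Return the module's forward entry point, or fail with its exports."""
        fn = getattr(module, "fwd", None)
        if fn is None:
            exports = [n for n in dir(module) if not n.startswith("_")]
            raise AttributeError(
                f"{module.__name__} has no 'fwd'; exported symbols: {exports}"
            )
        return fn
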
3936
  <div class="uv-install-logs" id="uv-logs-benchmark">
3937
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3938
  <div class="uv-logs-content" style="display: none;">
3939
+ Installed 1 package in 11ms
3940
  </div>
3941
  </div>
3942
  <div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3943
+ Fetching 11 files: 27%|██▋ | 3/11 [00:00&lt;00:00, 14.92it/s]
3944
+ Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 14.19it/s]
3945
+ Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 19.60it/s]</div>
3946
  <div class="cell-artifacts">
3947
  <h4>Artifacts:</h4>
3948
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/xformers.html CHANGED
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
3861
  </div>
3862
  </div>
3863
 
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 45.32s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3923,21 +3923,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
3923
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3924
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3925
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3926
- xformers_meff 5.28% 517.181us 24.85% 2.433ms 2.433ms 0.000us 0.00% 10.583ms 10.583ms 1
3927
- xformers_flash3::flash_fwd 2.21% 216.725us 19.17% 1.877ms 625.707us 0.000us 0.00% 10.583ms 3.528ms 3
3928
- flash_attn_3::fwd 0.75% 73.471us 16.96% 1.660ms 553.465us 7.934ms 100.00% 10.583ms 3.528ms 3
3929
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 7.935ms 100.02% 7.935ms 7.935ms 1
3930
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 7.934ms 100.00% 7.934ms 2.645ms 3
3931
- Activity Buffer Request 15.30% 1.498ms 15.30% 1.498ms 1.498ms 2.649ms 33.39% 2.649ms 2.649ms 1
3932
- aten::empty 0.35% 34.410us 0.35% 34.410us 5.735us 0.000us 0.00% 0.000us 0.000us 6
3933
- cudaFuncSetAttribute 0.13% 13.051us 0.13% 13.051us 4.350us 0.000us 0.00% 0.000us 0.000us 3
3934
- cudaLaunchKernel 0.42% 41.351us 0.42% 41.351us 13.784us 0.000us 0.00% 0.000us 0.000us 3
3935
- aten::reshape 0.14% 13.581us 0.40% 38.881us 6.480us 0.000us 0.00% 0.000us 0.000us 6
3936
- aten::view 0.26% 25.300us 0.26% 25.300us 4.217us 0.000us 0.00% 0.000us 0.000us 6
3937
- cudaDeviceSynchronize 75.15% 7.358ms 75.15% 7.358ms 7.358ms 0.000us 0.00% 0.000us 0.000us 1
3938
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3939
- Self CPU time total: 9.791ms
3940
- Self CUDA time total: 7.934ms
3941
 
3942
 
3943
 
@@ -3947,21 +3947,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
3947
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3948
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3949
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3950
- xformers_meff 2.97% 376.750us 17.03% 2.160ms 2.160ms 0.000us 0.00% 14.695ms 14.695ms 1
3951
- xformers_flash3::flash_fwd 1.31% 166.673us 13.88% 1.760ms 586.646us 0.000us 0.00% 14.695ms 4.898ms 3
3952
- flash_attn_3::fwd 0.41% 52.370us 12.57% 1.593ms 531.088us 11.013ms 100.00% 14.695ms 4.898ms 3
3953
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 11.015ms 100.02% 11.015ms 11.015ms 1
3954
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 11.013ms 100.00% 11.013ms 3.671ms 3
3955
- Activity Buffer Request 11.62% 1.473ms 11.62% 1.473ms 1.473ms 3.682ms 33.43% 3.682ms 3.682ms 1
3956
- aten::empty 0.22% 28.511us 0.22% 28.511us 4.752us 0.000us 0.00% 0.000us 0.000us 6
3957
- cudaFuncSetAttribute 0.04% 5.391us 0.04% 5.391us 1.797us 0.000us 0.00% 0.000us 0.000us 3
3958
- cudaLaunchKernel 0.27% 34.441us 0.27% 34.441us 11.480us 0.000us 0.00% 0.000us 0.000us 3
3959
- aten::reshape 0.07% 8.699us 0.18% 22.949us 3.825us 0.000us 0.00% 0.000us 0.000us 6
3960
- aten::view 0.11% 14.250us 0.11% 14.250us 2.375us 0.000us 0.00% 0.000us 0.000us 6
3961
- cudaDeviceSynchronize 82.97% 10.518ms 82.97% 10.518ms 10.518ms 0.000us 0.00% 0.000us 0.000us 1
3962
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3963
- Self CPU time total: 12.678ms
3964
- Self CUDA time total: 11.013ms
3965
 
3966
 
3967
 
@@ -3971,21 +3971,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
3971
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3972
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3973
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3974
- xformers_meff 2.76% 351.879us 17.06% 2.178ms 2.178ms 0.000us 0.00% 14.911ms 14.911ms 1
3975
- xformers_flash3::flash_fwd 1.47% 187.843us 14.11% 1.803ms 600.839us 0.000us 0.00% 14.911ms 4.970ms 3
3976
- flash_attn_3::fwd 0.41% 52.611us 12.64% 1.615ms 538.225us 11.083ms 100.00% 14.911ms 4.970ms 3
3977
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 11.085ms 100.02% 11.085ms 11.085ms 1
3978
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 11.083ms 100.00% 11.083ms 3.694ms 3
3979
- Activity Buffer Request 11.67% 1.491ms 11.67% 1.491ms 1.491ms 3.829ms 34.54% 3.829ms 3.829ms 1
3980
- aten::empty 0.23% 29.661us 0.23% 29.661us 4.944us 0.000us 0.00% 0.000us 0.000us 6
3981
- cudaFuncSetAttribute 0.04% 5.680us 0.04% 5.680us 1.893us 0.000us 0.00% 0.000us 0.000us 3
3982
- cudaLaunchKernel 0.28% 35.941us 0.28% 35.941us 11.980us 0.000us 0.00% 0.000us 0.000us 3
3983
- aten::reshape 0.07% 8.779us 0.19% 23.920us 3.987us 0.000us 0.00% 0.000us 0.000us 6
3984
- aten::view 0.12% 15.141us 0.12% 15.141us 2.524us 0.000us 0.00% 0.000us 0.000us 6
3985
- cudaDeviceSynchronize 82.94% 10.593ms 82.94% 10.593ms 10.593ms 0.000us 0.00% 0.000us 0.000us 1
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
- Self CPU time total: 12.771ms
3988
- Self CUDA time total: 11.083ms
3989
 
3990
 
3991
 
@@ -3995,21 +3995,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3997
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3998
- xformers_meff 2.60% 343.688us 18.22% 2.412ms 2.412ms 0.000us 0.00% 15.065ms 15.065ms 1
3999
- xformers_flash3::flash_fwd 1.25% 165.081us 15.45% 2.045ms 681.611us 0.000us 0.00% 15.065ms 5.022ms 3
4000
- flash_attn_3::fwd 0.38% 50.950us 14.20% 1.880ms 626.584us 11.285ms 100.00% 15.065ms 5.022ms 3
4001
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 11.286ms 100.02% 11.286ms 11.286ms 1
4002
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 11.285ms 100.00% 11.285ms 3.762ms 3
4003
- Activity Buffer Request 11.56% 1.531ms 11.56% 1.531ms 1.531ms 3.781ms 33.50% 3.781ms 3.781ms 1
4004
- aten::empty 0.22% 29.192us 0.22% 29.192us 4.865us 0.000us 0.00% 0.000us 0.000us 6
4005
- cudaFuncSetAttribute 0.04% 5.370us 0.04% 5.370us 1.790us 0.000us 0.00% 0.000us 0.000us 3
4006
- cudaLaunchKernel 1.99% 263.376us 1.99% 263.376us 87.792us 0.000us 0.00% 0.000us 0.000us 3
4007
- aten::reshape 0.07% 9.160us 0.18% 23.762us 3.960us 0.000us 0.00% 0.000us 0.000us 6
4008
- aten::view 0.11% 14.602us 0.11% 14.602us 2.434us 0.000us 0.00% 0.000us 0.000us 6
4009
- cudaDeviceSynchronize 81.78% 10.825ms 81.78% 10.825ms 10.825ms 0.000us 0.00% 0.000us 0.000us 1
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
- Self CPU time total: 13.238ms
4012
- Self CUDA time total: 11.285ms
4013
 
4014
 
4015
 
@@ -4019,21 +4019,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
- xformers_meff 2.46% 345.459us 17.01% 2.385ms 2.385ms 0.000us 0.00% 16.124ms 16.124ms 1
4023
- xformers_flash3::flash_fwd 1.15% 161.632us 14.38% 2.017ms 672.171us 0.000us 0.00% 16.124ms 5.375ms 3
4024
- flash_attn_3::fwd 0.37% 51.683us 13.23% 1.855ms 618.293us 12.092ms 100.00% 16.124ms 5.375ms 3
4025
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 12.094ms 100.02% 12.094ms 12.094ms 1
4026
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 12.092ms 100.00% 12.092ms 4.031ms 3
4027
- Activity Buffer Request 10.69% 1.499ms 10.69% 1.499ms 1.499ms 4.032ms 33.35% 4.032ms 4.032ms 1
4028
- aten::empty 0.21% 29.140us 0.21% 29.140us 4.857us 0.000us 0.00% 0.000us 0.000us 6
4029
- cudaFuncSetAttribute 0.04% 5.520us 0.04% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3
4030
- cudaLaunchKernel 1.92% 269.435us 1.92% 269.435us 89.812us 0.000us 0.00% 0.000us 0.000us 3
4031
- aten::reshape 0.06% 9.069us 0.16% 22.880us 3.813us 0.000us 0.00% 0.000us 0.000us 6
4032
- aten::view 0.10% 13.811us 0.10% 13.811us 2.302us 0.000us 0.00% 0.000us 0.000us 6
4033
- cudaDeviceSynchronize 82.99% 11.636ms 82.99% 11.636ms 11.636ms 0.000us 0.00% 0.000us 0.000us 1
4034
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4035
- Self CPU time total: 14.021ms
4036
- Self CUDA time total: 12.092ms
4037
 
4038
 
4039
 
@@ -4043,83 +4043,37 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
4043
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4044
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4045
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4046
- xformers_meff 2.36% 347.389us 16.65% 2.455ms 2.455ms 0.000us 0.00% 16.980ms 16.980ms 1
4047
- xformers_flash3::flash_fwd 1.09% 160.181us 14.14% 2.085ms 695.001us 0.000us 0.00% 16.980ms 5.660ms 3
4048
- flash_attn_3::fwd 0.36% 52.921us 13.05% 1.925ms 641.607us 12.735ms 100.00% 16.980ms 5.660ms 3
4049
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 12.738ms 100.02% 12.738ms 12.738ms 1
4050
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 12.735ms 100.00% 12.735ms 4.245ms 3
4051
- Activity Buffer Request 10.11% 1.491ms 10.11% 1.491ms 1.491ms 4.245ms 33.33% 4.245ms 4.245ms 1
4052
- aten::empty 0.20% 29.922us 0.20% 29.922us 4.987us 0.000us 0.00% 0.000us 0.000us 6
4053
- cudaFuncSetAttribute 0.04% 5.530us 0.04% 5.530us 1.843us 0.000us 0.00% 0.000us 0.000us 3
4054
- cudaLaunchKernel 2.34% 345.117us 2.34% 345.117us 115.039us 0.000us 0.00% 0.000us 0.000us 3
4055
- aten::reshape 0.06% 8.379us 0.15% 22.620us 3.770us 0.000us 0.00% 0.000us 0.000us 6
4056
- aten::view 0.10% 14.241us 0.10% 14.241us 2.373us 0.000us 0.00% 0.000us 0.000us 6
4057
- cudaDeviceSynchronize 83.35% 12.290ms 83.35% 12.290ms 12.290ms 0.000us 0.00% 0.000us 0.000us 1
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
- Self CPU time total: 14.745ms
4060
- Self CUDA time total: 12.735ms
4061
 
4062
 
4063
  impl wl p50(ms) ok
4064
- xformers_meff cuda_attn_L128_bfloat16 3.60 True
4065
- xformers_meff cuda_attn_L256_bfloat16 3.43 True
4066
- xformers_meff cuda_attn_L320_bfloat16 4.10 True
4067
- xformers_meff cuda_attn_L384_bfloat16 4.01 True
4068
- xformers_meff cuda_attn_L448_bfloat16 4.21 True
4069
- xformers_meff cuda_attn_L512_bfloat16 4.43 True
4070
  </pre></div>
4071
  <div class="uv-install-logs" id="uv-logs-benchmark">
4072
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4073
  <div class="uv-logs-content" style="display: none;">
4074
- Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
4075
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4076
- Downloading nvidia-curand-cu12 (60.7MiB)
4077
- Downloading triton (148.4MiB)
4078
- Downloading matplotlib (8.3MiB)
4079
- Downloading pillow (6.7MiB)
4080
- Downloading setuptools (1.1MiB)
4081
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4082
- Downloading torch (846.8MiB)
4083
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4084
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4085
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4086
- Downloading nvidia-cufile-cu12 (1.1MiB)
4087
- Downloading nvidia-nccl-cu12 (307.4MiB)
4088
- Downloading networkx (1.9MiB)
4089
- Downloading kiwisolver (1.4MiB)
4090
- Downloading fonttools (4.7MiB)
4091
- Downloading numpy (15.9MiB)
4092
- Downloading sympy (6.0MiB)
4093
- Downloading nvidia-cublas-cu12 (566.8MiB)
4094
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4095
- Downloading nvidia-cufft-cu12 (184.2MiB)
4096
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4097
  Downloading xformers (111.8MiB)
4098
- Downloading nvidia-cufile-cu12
4099
- Downloading kiwisolver
4100
- Downloading setuptools
4101
- Downloading fonttools
4102
- Downloading networkx
4103
- Downloading pillow
4104
- Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
4105
- Downloading nvidia-cuda-cupti-cu12
4106
- Downloading matplotlib
4107
- Downloading numpy
4108
- Downloading sympy
4109
- Downloading nvidia-nvjitlink-cu12
4110
- Downloading nvidia-curand-cu12
4111
- Downloading nvidia-cuda-nvrtc-cu12
4112
  Downloading xformers
4113
- Downloading triton
4114
- Downloading nvidia-cufft-cu12
4115
- Downloading nvidia-cusolver-cu12
4116
- Downloading nvidia-cusparse-cu12
4117
- Downloading nvidia-cusparselt-cu12
4118
- Downloading nvidia-nccl-cu12
4119
- Downloading nvidia-cublas-cu12
4120
- Downloading nvidia-cudnn-cu12
4121
- Downloading torch
4122
- Installed 38 packages in 211ms
4123
  </div>
4124
  </div>
4125
  <div class="cell-artifacts">
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
 
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 5.09s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3923
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3924
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3925
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3926
+ xformers_meff 10.73% 481.606us 51.24% 2.299ms 2.299ms 0.000us 0.00% 3.630ms 3.630ms 1
3927
+ xformers_flash3::flash_fwd 4.33% 194.084us 39.70% 1.781ms 593.782us 0.000us 0.00% 3.630ms 1.210ms 3
3928
+ flash_attn_3::fwd 1.76% 78.961us 35.37% 1.587ms 529.087us 2.729ms 100.00% 3.630ms 1.210ms 3
3929
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.730ms 100.05% 2.730ms 2.730ms 1
3930
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.729ms 100.00% 2.729ms 909.588us 3
3931
+ Activity Buffer Request 31.70% 1.423ms 31.70% 1.423ms 1.423ms 901.535us 33.04% 901.535us 901.535us 1
3932
+ aten::empty 0.75% 33.761us 0.75% 33.761us 5.627us 0.000us 0.00% 0.000us 0.000us 6
3933
+ cudaFuncSetAttribute 0.28% 12.380us 0.28% 12.380us 4.127us 0.000us 0.00% 0.000us 0.000us 3
3934
+ cudaLaunchKernel 0.88% 39.570us 0.88% 39.570us 13.190us 0.000us 0.00% 0.000us 0.000us 3
3935
+ aten::reshape 0.30% 13.520us 0.80% 36.080us 6.013us 0.000us 0.00% 0.000us 0.000us 6
3936
+ aten::view 0.50% 22.560us 0.50% 22.560us 3.760us 0.000us 0.00% 0.000us 0.000us 6
3937
+ cudaDeviceSynchronize 48.76% 2.188ms 48.76% 2.188ms 2.188ms 0.000us 0.00% 0.000us 0.000us 1
3938
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3939
+ Self CPU time total: 4.487ms
3940
+ Self CUDA time total: 2.729ms
3941
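
Note: the xformers_meff traces route through xformers_flash3::flash_fwd /
flash_attn_3::fwd, i.e. xformers' memory-efficient attention selecting a
FlashAttention-3 forward on this GPU. A minimal sketch of the call that
produces that dispatch (shapes are illustrative; backend selection is
automatic and hardware-dependent):

    import torch
    import xformers.ops as xops

    # xformers expects (batch, seq, heads, head_dim) layout.
    q = torch.randn(1, 128, 8, 64, device="cuda", dtype=torch.bfloat16)
    k, v = torch.randn_like(q), torch.randn_like(q)

    # Dispatches to the fastest available kernel; on supported GPUs this
    # is the flash_attn_3::fwd op recorded in the profiles above.
    out = xops.memory_efficient_attention(q, k, v)
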
 
3942
 
3943
 
 
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 7.10% 312.113us 46.81% 2.059ms 2.059ms 0.000us 0.00% 3.744ms 3.744ms 1
+ xformers_flash3::flash_fwd 3.88% 170.673us 39.17% 1.723ms 574.405us 0.000us 0.00% 3.744ms 1.248ms 3
+ flash_attn_3::fwd 1.28% 56.171us 35.29% 1.553ms 517.514us 2.795ms 100.00% 3.744ms 1.248ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.796ms 100.05% 2.796ms 2.796ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.00% 2.795ms 931.630us 3
+ Activity Buffer Request 32.47% 1.428ms 32.47% 1.428ms 1.428ms 948.729us 33.95% 948.729us 948.729us 1
+ aten::empty 0.66% 29.091us 0.66% 29.091us 4.848us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.13% 5.590us 0.13% 5.590us 1.863us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.76% 33.440us 0.76% 33.440us 11.147us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.20% 8.951us 0.54% 23.831us 3.972us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.34% 14.880us 0.34% 14.880us 2.480us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 53.19% 2.340ms 53.19% 2.340ms 2.340ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.399ms
+ Self CUDA time total: 2.795ms

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 6.52% 299.466us 45.41% 2.085ms 2.085ms 0.000us 0.00% 3.907ms 3.907ms 1
+ xformers_flash3::flash_fwd 3.09% 142.061us 38.39% 1.763ms 587.558us 0.000us 0.00% 3.907ms 1.302ms 3
+ flash_attn_3::fwd 1.15% 53.012us 35.30% 1.621ms 540.204us 2.913ms 100.00% 3.907ms 1.302ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.915ms 100.06% 2.915ms 2.915ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.913ms 100.00% 2.913ms 971.158us 3
+ Activity Buffer Request 32.68% 1.500ms 32.68% 1.500ms 1.500ms 993.281us 34.09% 993.281us 993.281us 1
+ aten::empty 0.62% 28.380us 0.62% 28.380us 4.730us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.11% 5.270us 0.11% 5.270us 1.757us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.73% 33.640us 0.73% 33.640us 11.213us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.18% 8.421us 0.49% 22.660us 3.777us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.31% 14.239us 0.31% 14.239us 2.373us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 54.59% 2.507ms 54.59% 2.507ms 2.507ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.591ms
+ Self CUDA time total: 2.913ms

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 6.26% 300.335us 46.54% 2.234ms 2.234ms 0.000us 0.00% 3.980ms 3.980ms 1
+ xformers_flash3::flash_fwd 3.08% 147.673us 39.81% 1.911ms 637.009us 0.000us 0.00% 3.980ms 1.327ms 3
+ flash_attn_3::fwd 1.12% 53.571us 36.74% 1.763ms 587.785us 2.981ms 100.00% 3.980ms 1.327ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.982ms 100.05% 2.982ms 2.982ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.981ms 100.00% 2.981ms 993.631us 3
+ Activity Buffer Request 29.81% 1.431ms 29.81% 1.431ms 1.431ms 999.263us 33.52% 999.263us 999.263us 1
+ aten::empty 0.60% 28.930us 0.60% 28.930us 4.822us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 5.610us 0.12% 5.610us 1.870us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 5.09% 244.533us 5.09% 244.533us 81.511us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.18% 8.489us 0.47% 22.530us 3.755us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.29% 14.041us 0.29% 14.041us 2.340us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 53.46% 2.566ms 53.46% 2.566ms 2.566ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.800ms
+ Self CUDA time total: 2.981ms

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 5.98% 313.865us 42.05% 2.207ms 2.207ms 0.000us 0.00% 4.635ms 4.635ms 1
+ xformers_flash3::flash_fwd 2.80% 146.723us 35.63% 1.870ms 623.176us 0.000us 0.00% 4.635ms 1.545ms 3
+ flash_attn_3::fwd 0.99% 51.861us 32.83% 1.723ms 574.268us 3.467ms 100.00% 4.635ms 1.545ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.469ms 100.05% 3.469ms 3.469ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.467ms 100.00% 3.467ms 1.156ms 3
+ Activity Buffer Request 27.82% 1.460ms 27.82% 1.460ms 1.460ms 1.168ms 33.68% 1.168ms 1.168ms 1
+ aten::empty 0.56% 29.260us 0.56% 29.260us 4.877us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 6.040us 0.12% 6.040us 2.013us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.35% 175.903us 3.35% 175.903us 58.634us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.16% 8.638us 0.44% 23.169us 3.862us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.28% 14.531us 0.28% 14.531us 2.422us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 57.95% 3.041ms 57.95% 3.041ms 3.041ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 5.247ms
+ Self CUDA time total: 3.467ms

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 5.97% 309.094us 41.86% 2.166ms 2.166ms 0.000us 0.00% 4.567ms 4.567ms 1
+ xformers_flash3::flash_fwd 2.75% 142.242us 35.45% 1.834ms 611.405us 0.000us 0.00% 4.567ms 1.522ms 3
+ flash_attn_3::fwd 1.04% 53.951us 32.70% 1.692ms 563.991us 3.419ms 100.00% 4.567ms 1.522ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.421ms 100.05% 3.421ms 3.421ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.419ms 100.00% 3.419ms 1.140ms 3
+ Activity Buffer Request 27.74% 1.436ms 27.74% 1.436ms 1.436ms 1.148ms 33.59% 1.148ms 1.148ms 1
+ aten::empty 0.58% 29.770us 0.58% 29.770us 4.962us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.11% 5.591us 0.11% 5.591us 1.864us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.23% 167.152us 3.23% 167.152us 55.717us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.16% 8.371us 0.44% 22.751us 3.792us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.28% 14.380us 0.28% 14.380us 2.397us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 58.14% 3.008ms 58.14% 3.008ms 3.008ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 5.174ms
+ Self CUDA time total: 3.419ms

  impl wl p50(ms) ok
+ xformers_meff cuda_attn_L128_bfloat16 1.00 True
+ xformers_meff cuda_attn_L256_bfloat16 1.04 True
+ xformers_meff cuda_attn_L320_bfloat16 1.09 True
+ xformers_meff cuda_attn_L384_bfloat16 1.11 True
+ xformers_meff cuda_attn_L448_bfloat16 1.26 True
+ xformers_meff cuda_attn_L512_bfloat16 1.25 True
  </pre></div>
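The xformers_meff implementation benchmarked above maps to xFormers' memory-efficient attention entry point; the profiles show it dispatching through xformers_flash3::flash_fwd into flash_attn_3::fwd. A minimal sketch of a comparable call (tensor shapes are assumptions):

import torch
import xformers.ops as xops

# xFormers expects (batch, seq_len, num_heads, head_dim) inputs.
q = torch.randn(1, 512, 8, 64, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)

# Dispatches to a fused kernel; the profiles above show the
# FlashAttention-3 path being selected on this GPU.
out = xops.memory_efficient_attention(q, k, v)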
  <div class="uv-install-logs" id="uv-logs-benchmark">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
  Downloading xformers (111.8MiB)
  Downloading xformers
+ Installed 1 package in 14ms
  </div>
  </div>
  <div class="cell-artifacts">
flash_attn/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: b3bfd4c5e82f8daf2fec939924eb6dc23b3e5d20e8327316e4f8b69db047e2a9
  • Pointer size: 130 Bytes
  • Size of remote file: 24 kB

Git LFS Details

  • SHA256: a94beca550ea0b3ff8a0f0eef062da6a6179ae09e78edc24cbacb71d8bd623a2
  • Pointer size: 130 Bytes
  • Size of remote file: 24.8 kB
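The two pointer blocks record the old and new LFS objects for latency.svg. A small sketch for checking a fetched file against the updated pointer's SHA256 (the local path is assumed):

import hashlib

def sha256_of(path: str) -> str:
    # Stream the file in 1 MiB chunks so large artifacts stay cheap to hash.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

# Compare against the second (updated) pointer above.
assert sha256_of("flash_attn/results/artifacts/combine/latency.svg") == (
    "a94beca550ea0b3ff8a0f0eef062da6a6179ae09e78edc24cbacb71d8bd623a2"
)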
flash_attn/results/combined_results.html CHANGED
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <div class="system-info">
  <div class="system-info-header">Generated on:</div>
  <div class="system-info-content">
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
  </div>
  </div>

@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:date>2025-10-24T19:27:34.267507</dc:date>
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
@@ -3891,320 +3891,333 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  [Inline SVG diff elided: matplotlib chart "Attention Implementation Latency"; x-axis "Workload" with ticks cuda_attn_L128_bfloat16 through cuda_attn_L512_bfloat16; y-axis "Latency P50 (ms)" with ticks 3 to 8; legend: torch_flash_ma, torch_mem_eff, xformers_meff, hf_kernels_flash_attn, hf_kernels_flash_attn3.]
  </svg>
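The elided figure is a plain p50-latency line chart; a short matplotlib sketch that reproduces its layout, with the values hardcoded from the combined summary later in this diff:

import matplotlib.pyplot as plt

workloads = [f"cuda_attn_L{n}_bfloat16" for n in (128, 256, 320, 384, 448, 512)]
p50_ms = {  # median latencies (ms) from the COMBINED BENCHMARK SUMMARY below
    "torch_flash_ma": [4.09, 4.79, 4.90, 4.98, 5.05, 5.47],
    "torch_mem_eff": [6.77, 7.24, 7.52, 7.59, 7.97, 8.47],
    "xformers_meff": [3.60, 3.43, 4.10, 4.01, 4.21, 4.43],
    "hf_kernels_flash_attn": [2.82, 3.91, 4.12, 4.13, 4.11, 4.57],
    "hf_kernels_flash_attn3": [3.22, 3.77, 3.91, 3.97, 4.19, 4.41],
}

fig, ax = plt.subplots(figsize=(9, 5))
for impl, vals in p50_ms.items():
    ax.plot(workloads, vals, marker="o", label=impl)
ax.set_xlabel("Workload")
ax.set_ylabel("Latency P50 (ms)")
ax.set_title("Attention Implementation Latency")
ax.grid(alpha=0.3)
ax.legend()
plt.xticks(rotation=45, ha="right")
fig.tight_layout()
fig.savefig("latency.svg")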
@@ -4217,7 +4230,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
- Cell: combine | 39.40s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4297,25 +4310,25 @@ Cell: combine | 39.40s
  <div class="cell-stdout"><pre class="stdout-text">======================================================================
  LOADING BENCHMARK DATA
  ======================================================================
- ✓ Flash (PyTorch SDPA) : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/1229e2a918a2e0c395750645114ee4e0e721d5f703c5221972db88ca3fe9e8b9
- ✓ MemEff (PyTorch SDPA) : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/2ad4910cf70b34b5a3a316e2d789b9763d6651ee3d6727249ff229320cd58d24
- ✓ xFormers : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/4883c01f586350408f08d21d8d78943b44e953dd559356f9392f803696daca1a
- ✓ HF Kernels Flash Attn : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/0c83351d95394732eb53074c734760a0bef9733834298a6c04a08d9ec6a12660
- ✓ HF Kernels Flash Attn3 : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/f514c47cfa55db88f58672a033a040f67509b5051b8e3f332dd6d20ae85a88a8
- ✓ SageAttention : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/44c62c72b6cd63934c6e75ade8b74c9428734ea4f94c030632b382a8f0107a57

  ✓ Found Flash (PyTorch SDPA)
- Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/1229e2a918a2e0c395750645114ee4e0e721d5f703c5221972db88ca3fe9e8b9/attention.jsonl
  ✓ Found MemEff (PyTorch SDPA)
- Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/2ad4910cf70b34b5a3a316e2d789b9763d6651ee3d6727249ff229320cd58d24/attention.jsonl
  ✓ Found xFormers
- Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/4883c01f586350408f08d21d8d78943b44e953dd559356f9392f803696daca1a/attention.jsonl
  ✓ Found HF Kernels Flash Attn
- Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/0c83351d95394732eb53074c734760a0bef9733834298a6c04a08d9ec6a12660/attention.jsonl
  ✓ Found HF Kernels Flash Attn3
- Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/f514c47cfa55db88f58672a033a040f67509b5051b8e3f332dd6d20ae85a88a8/attention.jsonl
  ✓ Found SageAttention
- Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/44c62c72b6cd63934c6e75ade8b74c9428734ea4f94c030632b382a8f0107a57/attention.jsonl

  ======================================================================
  Summary: 6 found, 0 skipped, 0 missing
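Each implementation's cache directory carries an attention.jsonl with one record per workload; a minimal loading sketch (the field names are assumptions inferred from the impl / wl / p50(ms) / ok summary columns):

import json
from pathlib import Path

def load_records(path: Path) -> list[dict]:
    # One JSON object per line; skip blank lines defensively.
    with path.open() as f:
        return [json.loads(line) for line in f if line.strip()]

# Hypothetical cache path; the real content hashes are listed above.
for rec in load_records(Path(".uvnote/cache/<hash>/attention.jsonl")):
    print(rec.get("impl"), rec.get("wl"), rec.get("p50_ms"), rec.get("ok"))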
@@ -4324,48 +4337,48 @@ Summary: 6 found, 0 skipped, 0 missing
  COMBINED BENCHMARK SUMMARY

  impl wl p50(ms) ok
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 2.82 True
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 3.91 True
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 4.12 True
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 4.13 True
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 4.11 True
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 4.57 True
- hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 3.22 True
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 3.77 True
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 3.91 True
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 3.97 True
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 4.19 True
- hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 4.41 True
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
- Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
- Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
- Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
- Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
- Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
- Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
- torch_flash_ma cuda_attn_L128_bfloat16 4.09 True
- torch_flash_ma cuda_attn_L256_bfloat16 4.79 True
- torch_flash_ma cuda_attn_L320_bfloat16 4.90 True
- torch_flash_ma cuda_attn_L384_bfloat16 4.98 True
- torch_flash_ma cuda_attn_L448_bfloat16 5.05 True
- torch_flash_ma cuda_attn_L512_bfloat16 5.47 True
- torch_mem_eff cuda_attn_L128_bfloat16 6.77 True
- torch_mem_eff cuda_attn_L256_bfloat16 7.24 True
- torch_mem_eff cuda_attn_L320_bfloat16 7.52 True
- torch_mem_eff cuda_attn_L384_bfloat16 7.59 True
- torch_mem_eff cuda_attn_L448_bfloat16 7.97 True
- torch_mem_eff cuda_attn_L512_bfloat16 8.47 True
- xformers_meff cuda_attn_L128_bfloat16 3.60 True
- xformers_meff cuda_attn_L256_bfloat16 3.43 True
- xformers_meff cuda_attn_L320_bfloat16 4.10 True
- xformers_meff cuda_attn_L384_bfloat16 4.01 True
- xformers_meff cuda_attn_L448_bfloat16 4.21 True
- xformers_meff cuda_attn_L512_bfloat16 4.43 True

  GENERATING COMBINED VISUALIZATION
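The sage_int8_fp16 rows above fail on an attribute lookup against the compiled extension module. A hedged sketch of the kind of guard that turns a missing symbol into a recorded FAIL rather than an aborted sweep (the function and record shapes are illustrative, not the harness's actual code):

def run_impl(module, q, k, v) -> dict:
    # The harness expects the kernel module to expose `fwd`; wheels built
    # against a different layout may not, which is the error reported above.
    fwd = getattr(module, "fwd", None)
    if fwd is None:
        return {"ok": False,
                "error": f"module {module.__name__!r} has no attribute 'fwd'"}
    return {"ok": True, "out": fwd(q, k, v)}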
@@ -4389,53 +4402,7 @@ Implementations included:
  <div class="uv-install-logs" id="uv-logs-combine">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
- Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading nvidia-cufft-cu12 (184.2MiB)
- Downloading nvidia-cufile-cu12 (1.1MiB)
- Downloading networkx (1.9MiB)
- Downloading kiwisolver (1.4MiB)
- Downloading nvidia-cusolver-cu12 (255.1MiB)
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
- Downloading pillow (6.7MiB)
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
- Downloading nvidia-cusparse-cu12 (274.9MiB)
- Downloading nvidia-cublas-cu12 (566.8MiB)
- Downloading numpy (15.9MiB)
- Downloading fonttools (4.7MiB)
- Downloading setuptools (1.1MiB)
- Downloading sympy (6.0MiB)
- Downloading nvidia-nccl-cu12 (307.4MiB)
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
- Downloading nvidia-cudnn-cu12 (674.0MiB)
- Downloading torch (846.8MiB)
- Downloading matplotlib (8.3MiB)
- Downloading triton (148.4MiB)
- Downloading nvidia-curand-cu12 (60.7MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
- Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading matplotlib
- Downloading nvidia-cuda-cupti-cu12
- Downloading numpy
- Downloading nvidia-nvjitlink-cu12
- Downloading sympy
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
- Installed 37 packages in 230ms
  </div>
  </div>
  <div class="cell-artifacts">
@@ -4448,7 +4415,7 @@ Installed 37 packages in 230ms
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:date>2025-10-24T19:27:34.267507</dc:date>
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
@@ -4467,320 +4434,333 @@ Installed 37 packages in 230ms
  [Inline SVG diff elided: verbatim repeat of the "Attention Implementation Latency" chart above, embedded a second time in the combine cell's output.]
4707
- <use ns4:href="#m7cd35be9cc" x="219.427214" y="364.396182" style="fill: #9467bd; stroke: #9467bd" />
4708
- <use ns4:href="#m7cd35be9cc" x="364.352985" y="354.430502" style="fill: #9467bd; stroke: #9467bd" />
4709
- <use ns4:href="#m7cd35be9cc" x="509.278756" y="350.637501" style="fill: #9467bd; stroke: #9467bd" />
4710
- <use ns4:href="#m7cd35be9cc" x="654.204528" y="335.557722" style="fill: #9467bd; stroke: #9467bd" />
4711
- <use ns4:href="#m7cd35be9cc" x="799.130299" y="321.02931" style="fill: #9467bd; stroke: #9467bd" />
4712
  </g>
4713
  </g>
4714
  <g id="patch_3">
4715
- <path d="M 38.27 447.507117 L 38.27 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4716
  </g>
4717
  <g id="patch_4">
4718
  <path d="M 835.361742 447.507117 L 835.361742 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4719
  </g>
4720
  <g id="patch_5">
4721
- <path d="M 38.27 447.507117 L 835.361742 447.507117 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4722
  </g>
4723
  <g id="patch_6">
4724
- <path d="M 38.27 26.88 L 835.361742 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4725
  </g>
4726
- <g id="text_13">
4727
- <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="436.815871" y="20.88" transform="rotate(-0 436.815871 20.88)">Attention Implementation Latency</text>
4728
  </g>
4729
  <g id="legend" class="legend">
4730
  <g id="patch_7">
4731
- <path d="M 45.27 109.66125 L 188.765313 109.66125 Q 190.765313 109.66125 190.765313 107.66125 L 190.765313 33.88 Q 190.765313 31.88 188.765313 31.88 L 45.27 31.88 Q 43.27 31.88 43.27 33.88 L 43.27 107.66125 Q 43.27 109.66125 45.27 109.66125 L 45.27 109.66125 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4732
  </g>
4733
- <g id="line2d_13">
4734
- <path d="M 47.27 39.978438 L 57.27 39.978438 L 67.27 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4735
  <g>
4736
- <use ns4:href="#md7efaf3aec" x="57.27" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4737
  </g>
4738
  </g>
4739
  <g id="legend-label--torch-flash-ma" class="legend">
4740
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="75.27" y="43.478438" transform="rotate(-0 75.27 43.478438)">torch_flash_ma</text>
4741
  </g>
4742
- <g id="line2d_14">
4743
- <path d="M 47.27 54.934687 L 57.27 54.934687 L 67.27 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4744
  <g>
4745
- <use ns4:href="#m9b8c54d372" x="57.27" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4746
  </g>
4747
  </g>
4748
  <g id="legend-label--torch-mem-eff" class="legend">
4749
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="75.27" y="58.434687" transform="rotate(-0 75.27 58.434687)">torch_mem_eff</text>
4750
  </g>
4751
- <g id="line2d_15">
4752
- <path d="M 47.27 69.890938 L 57.27 69.890938 L 67.27 69.890938 " style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4753
  <g>
4754
- <use ns4:href="#mc655281e0b" x="57.27" y="69.890938" style="fill: #2ca02c; stroke: #2ca02c" />
4755
  </g>
4756
  </g>
4757
  <g id="legend-label--xformers-meff" class="legend">
4758
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="75.27" y="73.390938" transform="rotate(-0 75.27 73.390938)">xformers_meff</text>
4759
  </g>
4760
- <g id="line2d_16">
4761
- <path d="M 47.27 84.847188 L 57.27 84.847188 L 67.27 84.847188 " style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4762
  <g>
4763
- <use ns4:href="#m61c8040d7e" x="57.27" y="84.847188" style="fill: #d62728; stroke: #d62728" />
4764
  </g>
4765
  </g>
4766
  <g id="legend-label--hf-kernels-flash-attn" class="legend">
4767
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="75.27" y="88.347188" transform="rotate(-0 75.27 88.347188)">hf_kernels_flash_attn</text>
4768
  </g>
4769
- <g id="line2d_17">
4770
- <path d="M 47.27 99.803438 L 57.27 99.803438 L 67.27 99.803438 " style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4771
  <g>
4772
- <use ns4:href="#m7cd35be9cc" x="57.27" y="99.803438" style="fill: #9467bd; stroke: #9467bd" />
4773
  </g>
4774
  </g>
4775
  <g id="legend-label--hf-kernels-flash-attn3" class="legend">
4776
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="75.27" y="103.303438" transform="rotate(-0 75.27 103.303438)">hf_kernels_flash_attn3</text>
4777
  </g>
4778
  </g>
4779
  </g>
4780
  </g>
4781
  <defs>
4782
- <clipPath id="p0d2e0c97d5">
4783
- <rect x="38.27" y="26.88" width="797.091742" height="420.627117" />
4784
  </clipPath>
4785
  </defs>
4786
  </svg>
 
  <div class="system-info">
  <div class="system-info-header">Generated on:</div>
  <div class="system-info-content">
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
  </div>
  </div>
 
 
+ [SVG plot markup omitted — "Attention Implementation Latency" (regenerated 2025-10-27T14:46:38): x-axis "Workload" over cuda_attn_L128_bfloat16, cuda_attn_L256_bfloat16, cuda_attn_L320_bfloat16, cuda_attn_L384_bfloat16, cuda_attn_L448_bfloat16, cuda_attn_L512_bfloat16; y-axis "Latency P50 (ms)" from 1.0 to 2.2; series: torch_flash_ma, torch_mem_eff, xformers_meff, hf_kernels_flash_attn, hf_kernels_flash_attn3]
  </svg>
 
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
+ Cell: combine | 4.50s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
  <div class="cell-stdout"><pre class="stdout-text">======================================================================
  LOADING BENCHMARK DATA
  ======================================================================
+ ✓ Flash (PyTorch SDPA)   : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/4b81c2b991fc4a0f70c4117e933abc4007fd7f3f55394d7778a4074adf29df04
+ ✓ MemEff (PyTorch SDPA)  : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/a23b7ad9cfb9e9968ec4a8f126174dc4a3ab5e6999c65a44570f93656598bd2f
+ ✓ xFormers               : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/269846603898e0ee1872d7a8b40fca43ba558b2f3400f8a7bedb1ee79df7da58
+ ✓ HF Kernels Flash Attn  : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/c1c92a22d205ca145ffb0083188c0f8eef512cfd6aa091b1e49d6329fbd08849
+ ✓ HF Kernels Flash Attn3 : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20
+ ✓ SageAttention          : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/1355120a3e88bcb74f4130be51dfe8b03e7dc2b7823f2a53b20da7899570a16f

  ✓ Found Flash (PyTorch SDPA)
+   Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/4b81c2b991fc4a0f70c4117e933abc4007fd7f3f55394d7778a4074adf29df04/attention.jsonl
  ✓ Found MemEff (PyTorch SDPA)
+   Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/a23b7ad9cfb9e9968ec4a8f126174dc4a3ab5e6999c65a44570f93656598bd2f/attention.jsonl
  ✓ Found xFormers
+   Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/269846603898e0ee1872d7a8b40fca43ba558b2f3400f8a7bedb1ee79df7da58/attention.jsonl
  ✓ Found HF Kernels Flash Attn
+   Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/c1c92a22d205ca145ffb0083188c0f8eef512cfd6aa091b1e49d6329fbd08849/attention.jsonl
  ✓ Found HF Kernels Flash Attn3
+   Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20/attention.jsonl
  ✓ Found SageAttention
+   Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/1355120a3e88bcb74f4130be51dfe8b03e7dc2b7823f2a53b20da7899570a16f/attention.jsonl

  ======================================================================
  Summary: 6 found, 0 skipped, 0 missing
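A minimal sketch of this loading step, assuming each attention.jsonl line carries the record fields used elsewhere in this report (`impl`, `wl` with a `name`, `lat_ms` with a `p50`, `ok`); the cache path below is a placeholder, not one of the real hash directories above:

```python
import json
from pathlib import Path

# Hypothetical display-name -> cached-results mapping; real runs use
# content-hash directories like the ones listed above.
CACHE = {
    "Flash (PyTorch SDPA)": Path(".uvnote/cache/<hash>/attention.jsonl"),
}

records, found, skipped = [], 0, 0
for name, path in CACHE.items():
    if path.exists():
        with path.open() as f:
            # One JSON record per non-empty line.
            records += [json.loads(line) for line in f if line.strip()]
        found += 1
        print(f"✓ Found {name}\n  Path: {path}")
    else:
        skipped += 1
print(f"Summary: {found} found, {skipped} skipped")
```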
 
  COMBINED BENCHMARK SUMMARY

  impl                     wl                        p50(ms)  ok
+ hf_kernels_flash_attn    cuda_attn_L128_bfloat16      0.98  True
+ hf_kernels_flash_attn    cuda_attn_L256_bfloat16      1.02  True
+ hf_kernels_flash_attn    cuda_attn_L320_bfloat16      1.05  True
+ hf_kernels_flash_attn    cuda_attn_L384_bfloat16      1.07  True
+ hf_kernels_flash_attn    cuda_attn_L448_bfloat16      1.23  True
+ hf_kernels_flash_attn    cuda_attn_L512_bfloat16      1.23  True
+ hf_kernels_flash_attn3   cuda_attn_L128_bfloat16      0.95  True
+ hf_kernels_flash_attn3   cuda_attn_L256_bfloat16      0.98  True
+ hf_kernels_flash_attn3   cuda_attn_L320_bfloat16      1.03  True
+ hf_kernels_flash_attn3   cuda_attn_L384_bfloat16      1.04  True
+ hf_kernels_flash_attn3   cuda_attn_L448_bfloat16      1.21  True
+ hf_kernels_flash_attn3   cuda_attn_L512_bfloat16      1.18  True
  sage_int8_fp16           cuda_attn_L128_bfloat16      FAIL  False
+   Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
  sage_int8_fp16           cuda_attn_L256_bfloat16      FAIL  False
+   Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
  sage_int8_fp16           cuda_attn_L320_bfloat16      FAIL  False
+   Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
  sage_int8_fp16           cuda_attn_L384_bfloat16      FAIL  False
+   Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
  sage_int8_fp16           cuda_attn_L448_bfloat16      FAIL  False
+   Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
  sage_int8_fp16           cuda_attn_L512_bfloat16      FAIL  False
+   Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
+ torch_flash_ma           cuda_attn_L128_bfloat16      1.22  True
+ torch_flash_ma           cuda_attn_L256_bfloat16      1.27  True
+ torch_flash_ma           cuda_attn_L320_bfloat16      1.31  True
+ torch_flash_ma           cuda_attn_L384_bfloat16      1.34  True
+ torch_flash_ma           cuda_attn_L448_bfloat16      1.48  True
+ torch_flash_ma           cuda_attn_L512_bfloat16      1.52  True
+ torch_mem_eff            cuda_attn_L128_bfloat16      1.89  True
+ torch_mem_eff            cuda_attn_L256_bfloat16      1.95  True
+ torch_mem_eff            cuda_attn_L320_bfloat16      2.05  True
+ torch_mem_eff            cuda_attn_L384_bfloat16      2.08  True
+ torch_mem_eff            cuda_attn_L448_bfloat16      2.13  True
+ torch_mem_eff            cuda_attn_L512_bfloat16      2.27  True
+ xformers_meff            cuda_attn_L128_bfloat16      1.00  True
+ xformers_meff            cuda_attn_L256_bfloat16      1.04  True
+ xformers_meff            cuda_attn_L320_bfloat16      1.09  True
+ xformers_meff            cuda_attn_L384_bfloat16      1.11  True
+ xformers_meff            cuda_attn_L448_bfloat16      1.26  True
+ xformers_meff            cuda_attn_L512_bfloat16      1.25  True
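The sage_int8_fp16 rows above fail before any timing happens: the loaded extension module does not expose the `fwd` entry point the harness expects. A defensive lookup along these lines (a sketch only; the alternative attribute name is a guess, not part of the real harness) would surface the failure cleanly instead of raising mid-benchmark:

```python
def resolve_entry_point(module, candidates=("fwd", "forward")):
    """Return the first callable attribute found among `candidates`."""
    for name in candidates:
        fn = getattr(module, name, None)
        if callable(fn):
            return fn
    # Mirrors the error recorded in the summary table above.
    raise AttributeError(
        f"module {module.__name__!r} has no attribute in {candidates}"
    )
```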
 
  GENERATING COMBINED VISUALIZATION
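A rough sketch of this visualization step, assuming the `records` list from the loading sketch above: pivot p50 latency per workload and draw one line per implementation, as in the "Attention Implementation Latency" chart rendered earlier in this report.

```python
import matplotlib.pyplot as plt

# impl -> {workload name -> p50 latency in ms}; FAIL rows (ok=False),
# such as sage_int8_fp16 above, are excluded.
series: dict[str, dict[str, float]] = {}
for r in records:
    if r.get("ok"):
        series.setdefault(r["impl"], {})[r["wl"]["name"]] = r["lat_ms"]["p50"]

workloads = sorted({wl for per_wl in series.values() for wl in per_wl})
fig, ax = plt.subplots(figsize=(9, 4.5))
for impl, per_wl in sorted(series.items()):
    xs = [wl for wl in workloads if wl in per_wl]
    ax.plot(xs, [per_wl[wl] for wl in xs], marker="o", label=impl)
ax.set_xlabel("Workload")
ax.set_ylabel("Latency P50 (ms)")
ax.set_title("Attention Implementation Latency")
ax.legend()
fig.autofmt_xdate(rotation=45)  # slant the workload tick labels, as in the SVG
fig.savefig("latency_p50.svg")
```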
 
 
  <div class="uv-install-logs" id="uv-logs-combine">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
+ Installed 37 packages in 259ms
  </div>
  </div>
  <div class="cell-artifacts">
 
+ [SVG artifact omitted — verbatim duplicate of the "Attention Implementation Latency" chart above]
4699
  </g>
4700
  <g id="patch_5">
4701
+ <path d="M 47.81 447.507117 L 835.361742 447.507117 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4702
  </g>
4703
  <g id="patch_6">
4704
+ <path d="M 47.81 26.88 L 835.361742 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4705
  </g>
4706
+ <g id="text_14">
4707
+ <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="441.585871" y="20.88" transform="rotate(-0 441.585871 20.88)">Attention Implementation Latency</text>
4708
  </g>
4709
  <g id="legend" class="legend">
4710
  <g id="patch_7">
4711
+ <path d="M 54.81 109.66125 L 198.305313 109.66125 Q 200.305313 109.66125 200.305313 107.66125 L 200.305313 33.88 Q 200.305313 31.88 198.305313 31.88 L 54.81 31.88 Q 52.81 31.88 52.81 33.88 L 52.81 107.66125 Q 52.81 109.66125 54.81 109.66125 L 54.81 109.66125 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4712
  </g>
4713
+ <g id="line2d_14">
4714
+ <path d="M 56.81 39.978438 L 66.81 39.978438 L 76.81 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4715
  <g>
4716
+ <use ns4:href="#md7efaf3aec" x="66.81" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4717
  </g>
4718
  </g>
4719
  <g id="legend-label--torch-flash-ma" class="legend">
4720
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="43.478438" transform="rotate(-0 84.81 43.478438)">torch_flash_ma</text>
4721
  </g>
4722
+ <g id="line2d_15">
4723
+ <path d="M 56.81 54.934687 L 66.81 54.934687 L 76.81 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4724
  <g>
4725
+ <use ns4:href="#m9b8c54d372" x="66.81" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4726
  </g>
4727
  </g>
4728
  <g id="legend-label--torch-mem-eff" class="legend">
4729
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="58.434687" transform="rotate(-0 84.81 58.434687)">torch_mem_eff</text>
4730
  </g>
4731
+ <g id="line2d_16">
4732
+ <path d="M 56.81 69.890938 L 66.81 69.890938 L 76.81 69.890938 " style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4733
  <g>
4734
+ <use ns4:href="#mc655281e0b" x="66.81" y="69.890938" style="fill: #2ca02c; stroke: #2ca02c" />
4735
  </g>
4736
  </g>
4737
  <g id="legend-label--xformers-meff" class="legend">
4738
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="73.390938" transform="rotate(-0 84.81 73.390938)">xformers_meff</text>
4739
  </g>
4740
+ <g id="line2d_17">
4741
+ <path d="M 56.81 84.847188 L 66.81 84.847188 L 76.81 84.847188 " style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4742
  <g>
4743
+ <use ns4:href="#m61c8040d7e" x="66.81" y="84.847188" style="fill: #d62728; stroke: #d62728" />
4744
  </g>
4745
  </g>
4746
  <g id="legend-label--hf-kernels-flash-attn" class="legend">
4747
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="88.347188" transform="rotate(-0 84.81 88.347188)">hf_kernels_flash_attn</text>
4748
  </g>
4749
+ <g id="line2d_18">
4750
+ <path d="M 56.81 99.803438 L 66.81 99.803438 L 76.81 99.803438 " style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4751
  <g>
4752
+ <use ns4:href="#m7cd35be9cc" x="66.81" y="99.803438" style="fill: #9467bd; stroke: #9467bd" />
4753
  </g>
4754
  </g>
4755
  <g id="legend-label--hf-kernels-flash-attn3" class="legend">
4756
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="103.303438" transform="rotate(-0 84.81 103.303438)">hf_kernels_flash_attn3</text>
4757
  </g>
4758
  </g>
4759
  </g>
4760
  </g>
4761
  <defs>
4762
+ <clipPath id="p09feef2583">
4763
+ <rect x="47.81" y="26.88" width="787.551742" height="420.627117" />
4764
  </clipPath>
4765
  </defs>
4766
  </svg>
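For orientation, a plot of this shape is straightforward to regenerate outside the harness. The sketch below uses matplotlib with placeholder data: the x-axis, axis labels, and latency values are assumptions, and only the title and series names come from the SVG above.

import matplotlib.pyplot as plt

# Placeholder inputs: x-axis values and per-series latencies are illustrative,
# not the benchmarked numbers. Series names match the chart legend above.
workloads = [1, 2, 3, 4, 5, 6]
series = {
    "torch_flash_ma": [2.0, 2.3, 2.5, 2.7, 3.4, 3.6],
    "torch_mem_eff": [5.2, 5.5, 6.0, 6.2, 6.4, 7.1],
    "xformers_meff": [1.0, 1.2, 1.5, 1.6, 2.3, 2.2],
    "hf_kernels_flash_attn": [0.9, 1.1, 1.3, 1.4, 2.1, 2.1],
    "hf_kernels_flash_attn3": [0.8, 0.9, 1.2, 1.3, 2.0, 1.9],
}

fig, ax = plt.subplots(figsize=(8, 4.5))
for name, latencies in series.items():
    ax.plot(workloads, latencies, marker="o", label=name)  # one line per impl
ax.set_title("Attention Implementation Latency")
ax.set_xlabel("workload")      # assumed label
ax.set_ylabel("latency (ms)")  # assumed label
ax.legend()
fig.savefig("latency.svg")     # matplotlib writes an SVG like the one diffed here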
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
layer_norm/impls/cells/benchmark.py CHANGED
@@ -3,6 +3,7 @@
 # dependencies = [
 #   "numpy",
 #   "torch==2.8.0",
+#   "kernels",
 #   "kernels-benchmark-tools",
 # ]
 #
@@ -12,15 +13,37 @@
 import torch
 import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
 
+# Load the layer norm kernel
+layer_norm_kernel = get_kernel("kernels-community/layer-norm")
+
 
-def torch_layer_norm(x, weight, bias, eps: float = 1e-5):
-    return torch.nn.functional.layer_norm(x, (x.shape[-1],), weight, bias, eps)
+def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
+    B, S, D = x.shape
+    # The kernel expects [N, D] input; support beta (bias) if provided.
+    out = layer_norm_kernel.dropout_add_ln_fwd(
+        input=x.view(-1, D),
+        gamma=weight,
+        beta=bias,
+        rowscale=None,
+        colscale=None,
+        x0_subset=None,
+        z_subset=None,
+        dropout_p=0.0,
+        epsilon=eps,
+        rowscale_const=1.0,
+        z_numrows=S,
+        gen=None,
+        residual_in_fp32=False,
+        is_rms_norm=False,
+    )[0].view(B, S, D)
+    return out
 
 
 run_benchmark(
     kernel_type=KernelTypeEnum.LAYER_NORM,
-    impl_name="torch_layer_norm",
-    impl_tags={"family": "torch", "op": "layer_norm"},
-    impl_func=torch_layer_norm,
+    impl_name="hf_kernels_layer_norm",
+    impl_tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"},
+    impl_func=hf_kernels_layer_norm,
 )
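Since the new cell routes layer norm through the kernel's dropout_add_ln_fwd entry point with dropout disabled, a quick standalone sanity check is to compare it against torch.nn.functional.layer_norm. A minimal sketch, assuming a CUDA device and the kernels package installed; shapes and dtype are illustrative:

import torch
from kernels import get_kernel

layer_norm_kernel = get_kernel("kernels-community/layer-norm")

B, S, D = 4, 512, 1024
x = torch.randn(B, S, D, device="cuda", dtype=torch.float16)
weight = torch.randn(D, device="cuda", dtype=torch.float16)
bias = torch.randn(D, device="cuda", dtype=torch.float16)

# Same call as in the benchmark cell: flatten to [N, D], then restore shape.
out = layer_norm_kernel.dropout_add_ln_fwd(
    input=x.view(-1, D), gamma=weight, beta=bias,
    rowscale=None, colscale=None, x0_subset=None, z_subset=None,
    dropout_p=0.0, epsilon=1e-5, rowscale_const=1.0, z_numrows=S,
    gen=None, residual_in_fp32=False, is_rms_norm=False,
)[0].view(B, S, D)

ref = torch.nn.functional.layer_norm(x, (D,), weight, bias, 1e-5)
print("max abs diff:", (out - ref).abs().max().item())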
layer_norm/impls/hf_kernels_layer_norm.html CHANGED
The diff for this file is too large to render. See raw diff
 
layer_norm/impls/torch_layer_norm.html CHANGED
The diff for this file is too large to render. See raw diff
 
layer_norm/results/artifacts/combine/latency.svg CHANGED

Git LFS Details (old)

  • SHA256: 9666e51a7b23e41e320cf61de04ef7044c3870632454dcae02bf6d9c87decec7
  • Pointer size: 128 Bytes
  • Size of remote file: 947 Bytes

Git LFS Details (new)

  • SHA256: e7883bd5f88a9163cc9fdaeec2076ca6319f97d413c6bea136db33612dc2b864
  • Pointer size: 128 Bytes
  • Size of remote file: 947 Bytes
layer_norm/results/combined_results.html CHANGED
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <div class="system-info">
 <div class="system-info-header">Generated on:</div>
 <div class="system-info-content">
-Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
+Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
 </div>
 </div>
 
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-<dc:date>2025-10-24T19:26:16.447564</dc:date>
+<dc:date>2025-10-27T14:46:34.455868</dc:date>
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
@@ -3900,7 +3900,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: combine | 38.84s
+Cell: combine | 4.28s
 | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -3972,13 +3972,13 @@ Cell: combine | 38.84s
 <div class="cell-stdout"><pre class="stdout-text">======================================================================
 LOADING BENCHMARK DATA
 ======================================================================
-✓ PyTorch LayerNorm : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/layer_norm/impls/.uvnote/cache/679c54caaf848e698d978e76e5f2839b8565918d30fac991242aebea8229f1c9
-✓ HF Kernels LayerNorm : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/layer_norm/impls/.uvnote/cache/f7de4b4d3171468ce97015124a3af1a23ef8d4ff4f319bd566a88676d47f08db
+✓ PyTorch LayerNorm : /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/4403c31e9bef6e648597b4fcc9cfdc402678aaa4f90636b74325f12d334214a3
+✓ HF Kernels LayerNorm : /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/bd278151199f29b397d85857b87922edaa39a62623fb28e0465de47d6a3bac74
 
 ✓ Found PyTorch LayerNorm
-  Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/layer_norm/impls/.uvnote/cache/679c54caaf848e698d978e76e5f2839b8565918d30fac991242aebea8229f1c9/layer_norm.jsonl
+  Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/4403c31e9bef6e648597b4fcc9cfdc402678aaa4f90636b74325f12d334214a3/layer_norm.jsonl
 ✓ Found HF Kernels LayerNorm
-  Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/layer_norm/impls/.uvnote/cache/f7de4b4d3171468ce97015124a3af1a23ef8d4ff4f319bd566a88676d47f08db/layer_norm.jsonl
+  Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/bd278151199f29b397d85857b87922edaa39a62623fb28e0465de47d6a3bac74/layer_norm.jsonl
 
 ======================================================================
 Summary: 2 found, 0 skipped, 0 missing
@@ -3987,102 +3987,102 @@ Summary: 2 found, 0 skipped, 0 missing
 COMBINED BENCHMARK SUMMARY
 
 impl wl p50(ms) ok
-hf_kernels_layer_norm LN_B16_S1024_D1024 0.29 False
-hf_kernels_layer_norm LN_B16_S1024_D2048 0.61 False
-hf_kernels_layer_norm LN_B16_S1024_D4096 1.15 False
-hf_kernels_layer_norm LN_B16_S1024_D8192 2.27 False
+hf_kernels_layer_norm LN_B16_S1024_D1024 0.05 False
+hf_kernels_layer_norm LN_B16_S1024_D2048 0.22 False
+hf_kernels_layer_norm LN_B16_S1024_D4096 0.44 False
+hf_kernels_layer_norm LN_B16_S1024_D8192 0.84 False
 hf_kernels_layer_norm LN_B16_S128_D1024 0.05 False
 hf_kernels_layer_norm LN_B16_S128_D2048 0.05 False
-hf_kernels_layer_norm LN_B16_S128_D4096 0.06 False
-hf_kernels_layer_norm LN_B16_S128_D8192 0.30 False
-hf_kernels_layer_norm LN_B16_S2048_D1024 0.61 False
-hf_kernels_layer_norm LN_B16_S2048_D2048 1.20 False
-hf_kernels_layer_norm LN_B16_S2048_D4096 2.27 False
-hf_kernels_layer_norm LN_B16_S2048_D8192 4.51 False
-hf_kernels_layer_norm LN_B16_S512_D1024 0.06 False
-hf_kernels_layer_norm LN_B16_S512_D2048 0.30 False
-hf_kernels_layer_norm LN_B16_S512_D4096 0.59 False
-hf_kernels_layer_norm LN_B16_S512_D8192 1.16 False
+hf_kernels_layer_norm LN_B16_S128_D4096 0.05 False
+hf_kernels_layer_norm LN_B16_S128_D8192 0.05 False
+hf_kernels_layer_norm LN_B16_S2048_D1024 0.21 False
+hf_kernels_layer_norm LN_B16_S2048_D2048 0.46 False
+hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 False
+hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 False
+hf_kernels_layer_norm LN_B16_S512_D1024 0.05 False
+hf_kernels_layer_norm LN_B16_S512_D2048 0.05 False
+hf_kernels_layer_norm LN_B16_S512_D4096 0.21 False
+hf_kernels_layer_norm LN_B16_S512_D8192 0.43 False
 hf_kernels_layer_norm LN_B1_S1024_D1024 0.05 False
 hf_kernels_layer_norm LN_B1_S1024_D2048 0.05 False
 hf_kernels_layer_norm LN_B1_S1024_D4096 0.05 False
-hf_kernels_layer_norm LN_B1_S1024_D8192 0.06 False
-hf_kernels_layer_norm LN_B1_S128_D1024 0.05 False
+hf_kernels_layer_norm LN_B1_S1024_D8192 0.05 False
+hf_kernels_layer_norm LN_B1_S128_D1024 0.04 False
 hf_kernels_layer_norm LN_B1_S128_D2048 0.05 False
 hf_kernels_layer_norm LN_B1_S128_D4096 0.05 False
 hf_kernels_layer_norm LN_B1_S128_D8192 0.05 False
 hf_kernels_layer_norm LN_B1_S2048_D1024 0.05 False
 hf_kernels_layer_norm LN_B1_S2048_D2048 0.05 False
-hf_kernels_layer_norm LN_B1_S2048_D4096 0.06 False
-hf_kernels_layer_norm LN_B1_S2048_D8192 0.29 False
+hf_kernels_layer_norm LN_B1_S2048_D4096 0.05 False
+hf_kernels_layer_norm LN_B1_S2048_D8192 0.05 False
 hf_kernels_layer_norm LN_B1_S512_D1024 0.05 False
 hf_kernels_layer_norm LN_B1_S512_D2048 0.05 False
 hf_kernels_layer_norm LN_B1_S512_D4096 0.05 False
 hf_kernels_layer_norm LN_B1_S512_D8192 0.05 False
 hf_kernels_layer_norm LN_B4_S1024_D1024 0.05 False
-hf_kernels_layer_norm LN_B4_S1024_D2048 0.07 False
-hf_kernels_layer_norm LN_B4_S1024_D4096 0.29 False
-hf_kernels_layer_norm LN_B4_S1024_D8192 0.59 False
+hf_kernels_layer_norm LN_B4_S1024_D2048 0.05 False
+hf_kernels_layer_norm LN_B4_S1024_D4096 0.05 False
+hf_kernels_layer_norm LN_B4_S1024_D8192 0.21 False
 hf_kernels_layer_norm LN_B4_S128_D1024 0.05 False
 hf_kernels_layer_norm LN_B4_S128_D2048 0.05 False
 hf_kernels_layer_norm LN_B4_S128_D4096 0.05 False
 hf_kernels_layer_norm LN_B4_S128_D8192 0.05 False
-hf_kernels_layer_norm LN_B4_S2048_D1024 0.06 False
-hf_kernels_layer_norm LN_B4_S2048_D2048 0.30 False
-hf_kernels_layer_norm LN_B4_S2048_D4096 0.60 False
-hf_kernels_layer_norm LN_B4_S2048_D8192 1.15 False
+hf_kernels_layer_norm LN_B4_S2048_D1024 0.05 False
+hf_kernels_layer_norm LN_B4_S2048_D2048 0.06 False
+hf_kernels_layer_norm LN_B4_S2048_D4096 0.21 False
+hf_kernels_layer_norm LN_B4_S2048_D8192 0.44 False
 hf_kernels_layer_norm LN_B4_S512_D1024 0.05 False
 hf_kernels_layer_norm LN_B4_S512_D2048 0.05 False
-hf_kernels_layer_norm LN_B4_S512_D4096 0.06 False
-hf_kernels_layer_norm LN_B4_S512_D8192 0.29 False
-torch_layer_norm LN_B16_S1024_D1024 0.29 False
-torch_layer_norm LN_B16_S1024_D2048 0.59 False
-torch_layer_norm LN_B16_S1024_D4096 1.15 False
-torch_layer_norm LN_B16_S1024_D8192 2.27 False
+hf_kernels_layer_norm LN_B4_S512_D4096 0.05 False
+hf_kernels_layer_norm LN_B4_S512_D8192 0.05 False
+torch_layer_norm LN_B16_S1024_D1024 0.05 False
+torch_layer_norm LN_B16_S1024_D2048 0.21 False
+torch_layer_norm LN_B16_S1024_D4096 0.42 False
+torch_layer_norm LN_B16_S1024_D8192 0.85 False
 torch_layer_norm LN_B16_S128_D1024 0.03 False
-torch_layer_norm LN_B16_S128_D2048 0.04 False
-torch_layer_norm LN_B16_S128_D4096 0.05 False
-torch_layer_norm LN_B16_S128_D8192 0.27 False
-torch_layer_norm LN_B16_S2048_D1024 0.59 False
-torch_layer_norm LN_B16_S2048_D2048 1.16 False
-torch_layer_norm LN_B16_S2048_D4096 2.30 False
-torch_layer_norm LN_B16_S2048_D8192 4.51 False
-torch_layer_norm LN_B16_S512_D1024 0.07 False
-torch_layer_norm LN_B16_S512_D2048 0.29 False
-torch_layer_norm LN_B16_S512_D4096 0.59 False
-torch_layer_norm LN_B16_S512_D8192 1.15 False
+torch_layer_norm LN_B16_S128_D2048 0.03 False
+torch_layer_norm LN_B16_S128_D4096 0.04 False
+torch_layer_norm LN_B16_S128_D8192 0.05 False
+torch_layer_norm LN_B16_S2048_D1024 0.21 False
+torch_layer_norm LN_B16_S2048_D2048 0.42 False
+torch_layer_norm LN_B16_S2048_D4096 0.82 False
+torch_layer_norm LN_B16_S2048_D8192 1.68 False
+torch_layer_norm LN_B16_S512_D1024 0.04 False
+torch_layer_norm LN_B16_S512_D2048 0.05 False
+torch_layer_norm LN_B16_S512_D4096 0.21 False
+torch_layer_norm LN_B16_S512_D8192 0.43 False
 torch_layer_norm LN_B1_S1024_D1024 0.03 False
 torch_layer_norm LN_B1_S1024_D2048 0.03 False
-torch_layer_norm LN_B1_S1024_D4096 0.04 False
-torch_layer_norm LN_B1_S1024_D8192 0.05 False
-torch_layer_norm LN_B1_S128_D1024 0.03 False
+torch_layer_norm LN_B1_S1024_D4096 0.03 False
+torch_layer_norm LN_B1_S1024_D8192 0.04 False
+torch_layer_norm LN_B1_S128_D1024 0.02 False
 torch_layer_norm LN_B1_S128_D2048 0.03 False
 torch_layer_norm LN_B1_S128_D4096 0.03 False
 torch_layer_norm LN_B1_S128_D8192 0.03 False
-torch_layer_norm LN_B1_S2048_D1024 0.04 False
-torch_layer_norm LN_B1_S2048_D2048 0.04 False
-torch_layer_norm LN_B1_S2048_D4096 0.05 False
-torch_layer_norm LN_B1_S2048_D8192 0.27 False
+torch_layer_norm LN_B1_S2048_D1024 0.03 False
+torch_layer_norm LN_B1_S2048_D2048 0.03 False
+torch_layer_norm LN_B1_S2048_D4096 0.04 False
+torch_layer_norm LN_B1_S2048_D8192 0.05 False
 torch_layer_norm LN_B1_S512_D1024 0.03 False
 torch_layer_norm LN_B1_S512_D2048 0.03 False
 torch_layer_norm LN_B1_S512_D4096 0.03 False
-torch_layer_norm LN_B1_S512_D8192 0.04 False
-torch_layer_norm LN_B4_S1024_D1024 0.05 False
-torch_layer_norm LN_B4_S1024_D2048 0.06 False
-torch_layer_norm LN_B4_S1024_D4096 0.28 False
-torch_layer_norm LN_B4_S1024_D8192 0.59 False
+torch_layer_norm LN_B1_S512_D8192 0.03 False
+torch_layer_norm LN_B4_S1024_D1024 0.03 False
+torch_layer_norm LN_B4_S1024_D2048 0.04 False
+torch_layer_norm LN_B4_S1024_D4096 0.05 False
+torch_layer_norm LN_B4_S1024_D8192 0.20 False
 torch_layer_norm LN_B4_S128_D1024 0.03 False
 torch_layer_norm LN_B4_S128_D2048 0.03 False
 torch_layer_norm LN_B4_S128_D4096 0.03 False
-torch_layer_norm LN_B4_S128_D8192 0.04 False
-torch_layer_norm LN_B4_S2048_D1024 0.07 False
-torch_layer_norm LN_B4_S2048_D2048 0.28 False
-torch_layer_norm LN_B4_S2048_D4096 0.58 False
-torch_layer_norm LN_B4_S2048_D8192 1.15 False
+torch_layer_norm LN_B4_S128_D8192 0.03 False
+torch_layer_norm LN_B4_S2048_D1024 0.04 False
+torch_layer_norm LN_B4_S2048_D2048 0.05 False
+torch_layer_norm LN_B4_S2048_D4096 0.21 False
+torch_layer_norm LN_B4_S2048_D8192 0.44 False
 torch_layer_norm LN_B4_S512_D1024 0.03 False
-torch_layer_norm LN_B4_S512_D2048 0.04 False
-torch_layer_norm LN_B4_S512_D4096 0.05 False
-torch_layer_norm LN_B4_S512_D8192 0.27 False
+torch_layer_norm LN_B4_S512_D2048 0.03 False
+torch_layer_norm LN_B4_S512_D4096 0.04 False
+torch_layer_norm LN_B4_S512_D8192 0.05 False
 
 GENERATING COMBINED VISUALIZATION
 
@@ -4101,53 +4101,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading sympy (6.0MiB)
-Downloading pillow (6.7MiB)
-Downloading setuptools (1.1MiB)
-Downloading fonttools (4.7MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading networkx (1.9MiB)
-Downloading matplotlib (8.3MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading numpy (15.9MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading torch (846.8MiB)
-Downloading triton (148.4MiB)
-Downloading nvidia-cufile-cu12
-Downloading kiwisolver
-Downloading setuptools
-Downloading networkx
-Downloading fonttools
-Downloading pillow
-Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading matplotlib
-Downloading nvidia-cuda-cupti-cu12
-Downloading numpy
-Downloading nvidia-nvjitlink-cu12
-Downloading sympy
-Downloading nvidia-curand-cu12
-Downloading nvidia-cuda-nvrtc-cu12
-Downloading triton
-Downloading nvidia-cufft-cu12
-Downloading nvidia-cusolver-cu12
-Downloading nvidia-cusparse-cu12
-Downloading nvidia-cusparselt-cu12
-Downloading nvidia-nccl-cu12
-Downloading nvidia-cublas-cu12
-Downloading nvidia-cudnn-cu12
-Downloading torch
-Installed 37 packages in 205ms
+Installed 37 packages in 260ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4160,7 +4114,7 @@ Installed 37 packages in 205ms
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-<dc:date>2025-10-24T19:26:16.447564</dc:date>
+<dc:date>2025-10-27T14:46:34.455868</dc:date>
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
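The p50 column above is read from each implementation's layer_norm.jsonl artifact. Below is a minimal sketch of rebuilding the combined table from those files; the field names (impl, wl.name, lat_ms.p50, ok) and the local paths are assumptions about the kernels-benchmark-tools record layout, not a verified API.

import json

# Placeholder paths; in the run above these live under .uvnote/cache/<hash>/.
paths = [
    "torch_layer_norm/layer_norm.jsonl",
    "hf_kernels_layer_norm/layer_norm.jsonl",
]

rows = []
for path in paths:
    with open(path) as f:
        for line in f:
            rec = json.loads(line)  # one benchmark record per line
            rows.append((rec["impl"], rec["wl"]["name"],
                         rec["lat_ms"]["p50"], rec["ok"]))

# Print a table shaped like the "impl wl p50(ms) ok" summary above.
for impl, wl, p50, ok in sorted(rows):
    print(f"{impl:<24} {wl:<20} {p50:8.2f} {ok}")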