diff --git a/activation/impls/artifacts/benchmark/activation.jsonl b/activation/impls/artifacts/benchmark/activation.jsonl
index 434051e7334ba93dc285ce3121b82c5d0482ab20..bd99204f3d8c69a9f9ed111fe61aacdcbe3a9171 100644
--- a/activation/impls/artifacts/benchmark/activation.jsonl
+++ b/activation/impls/artifacts/benchmark/activation.jsonl
@@ -1,9 +1,9 @@
-{"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.04845100920647383, "p50": 0.04891102435067296, "p90": 0.05595100810751319, "mean": 0.051765027455985546, "iqr": 0.007269962225109339, "raw_times": [0.04868104588240385, 0.04891102435067296, 0.04845100920647383, 0.0568310497328639, 0.05595100810751319], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.055451004300266504, "peak_bytes": 2164736, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
-{"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.053381023462861776, "p50": 0.05378195783123374, "p90": 0.055961019825190306, "mean": 0.05980720743536949, "iqr": 0.002529995981603861, "raw_times": [0.055961019825190306, 0.053381023462861776, 0.08248101221397519, 0.05378195783123374, 0.053431023843586445], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.061191036365926266, "peak_bytes": 2885632, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
-{"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05066103767603636, "p50": 0.0530310207977891, "p90": 0.0544210197404027, "mean": 0.052935024723410606, "iqr": 0.002769986167550087, "raw_times": [0.05066103767603636, 0.0544210197404027, 0.0530310207977891, 0.05491101182997227, 0.05165103357285261], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057801022194325924, "peak_bytes": 5769216, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
-{"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.051781011279672384, "p50": 0.053541967645287514, "p90": 0.05360104842111468, "mean": 0.05314521258696914, "iqr": 0.0010800431482493877, "raw_times": [0.052521005272865295, 0.05428103031590581, 0.05360104842111468, 0.053541967645287514, 0.051781011279672384], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0558110186830163, "peak_bytes": 4327424, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
-{"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05193101242184639, "p50": 0.05308096297085285, "p90": 0.05407101707533002, "mean": 0.05422099493443966, "iqr": 0.0010799849405884743, "raw_times": [0.052991032134741545, 0.05407101707533002, 0.05903095006942749, 0.05308096297085285, 0.05193101242184639], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05453097401186824, "peak_bytes": 5769216, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
-{"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05170103395357728, "p50": 0.05204096669331193, "p90": 0.0529709504917264, "mean": 0.0523771857842803, "iqr": 0.0009989598765969276, "raw_times": [0.05204096669331193, 0.05170103395357728, 0.05320098716765642, 0.05197199061512947, 0.0529709504917264], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05481095286086202, "peak_bytes": 11536384, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
-{"ts": "2025-10-24T19:24:49Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05258101737126708, "p50": 0.05330098792910576, "p90": 0.053990981541574, "mean": 0.053516996558755636, "iqr": 0.0007699709385633469, "raw_times": [0.05258101737126708, 0.053221010603010654, 0.05330098792910576, 0.054490985348820686, 0.053990981541574], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05454104393720627, "peak_bytes": 8652800, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
-{"ts": "2025-10-24T19:24:49Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.050960981752723455, "p50": 0.05149102071300149, "p90": 0.05149102071300149, "mean": 0.051745015662163496, "iqr": 0.00012997770681977272, "raw_times": [0.050960981752723455, 0.05149102071300149, 0.05149102071300149, 0.05342101212590933, 0.05136104300618172], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05551200592890382, "peak_bytes": 11536384, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
-{"ts": "2025-10-24T19:24:49Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.049641006626188755, "p50": 0.05309097468852997, "p90": 0.05348102422431111, "mean": 0.052487198263406754, "iqr": 0.0008300412446260452, "raw_times": [0.049641006626188755, 0.05357200279831886, 0.05265098297968507, 0.05309097468852997, 0.05348102422431111], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05537096876651049, "peak_bytes": 23070720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024320999955307343, "p50": 0.025090999997701147, "p90": 0.02569000002949906, "mean": 0.026606800020090304, "iqr": 0.0010690000635804608, "raw_times": [0.03331100015202537, 0.025090999997701147, 0.024320999955307343, 0.02569000002949906, 0.0246209999659186], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03336100007800269, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028640999971685233, "p50": 0.02958999994007172, "p90": 0.030561000130546745, "mean": 0.02986059994327661, "iqr": 0.0012610003068402875, "raw_times": [0.029299999823706457, 0.028640999971685233, 0.02958999994007172, 0.0312109998503729, 0.030561000130546745], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03354099999341997, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02880000010918593, "p50": 0.030331000061778468, "p90": 0.030401000003621448, "mean": 0.030208600037440192, "iqr": 0.0004209998678561533, "raw_times": [0.02880000010918593, 0.03153099987684982, 0.029980000135765295, 0.030331000061778468, 0.030401000003621448], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03317000005154114, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02921000009337149, "p50": 0.0294310000299447, "p90": 0.029789999871354667, "mean": 0.029938399984530406, "iqr": 0.0004489997991186101, "raw_times": [0.0294310000299447, 0.02921000009337149, 0.03191999985574512, 0.029341000072236056, 0.029789999871354667], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03343000003042107, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.029799999992974335, "p50": 0.031021000040709623, "p90": 0.031239999998433632, "mean": 0.03210639997632825, "iqr": 0.0009289999525208259, "raw_times": [0.038159999803610845, 0.031021000040709623, 0.029799999992974335, 0.030311000045912806, 0.031239999998433632], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03207100007784902, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0278800000614865, "p50": 0.028550999786602915, "p90": 0.029250000125102815, "mean": 0.02903839999817137, "iqr": 0.0010100000054080738, "raw_times": [0.0278800000614865, 0.02824000011969474, 0.028550999786602915, 0.029250000125102815, 0.03127099989796989], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03262000018366962, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02945000005638576, "p50": 0.029881000045861583, "p90": 0.03017099993485317, "mean": 0.03019639998456114, "iqr": 0.0005509998572961194, "raw_times": [0.029881000045861583, 0.03185999980814813, 0.03017099993485317, 0.02945000005638576, 0.02962000007755705], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031610000178261544, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02807000009852345, "p50": 0.028989999918849207, "p90": 0.02929000015683414, "mean": 0.028920200020365883, "iqr": 0.0003590002961573191, "raw_times": [0.029320000066945795, 0.02929000015683414, 0.02807000009852345, 0.028989999918849207, 0.02893099986067682], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.033219999977518455, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.029301000040504732, "p50": 0.03090099994551565, "p90": 0.03149000008306757, "mean": 0.03127060003862425, "iqr": 0.0014889999420120148, "raw_times": [0.029301000040504732, 0.030001000141055556, 0.03149000008306757, 0.03465999998297775, 0.03090099994551565], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03197000000909611, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
diff --git a/activation/impls/cells/benchmark.py b/activation/impls/cells/benchmark.py
index 711af9e01652ef5081b507affd0f7df9ac99e644..04f9df27c14acf429b58dba6cf0677c00cbbbced 100644
--- a/activation/impls/cells/benchmark.py
+++ b/activation/impls/cells/benchmark.py
@@ -4,6 +4,7 @@
 #     "numpy",
 #     "torch==2.8.0",
 #     "kernels-benchmark-tools",
+#     "kernels",
 # ]
 #
 # [tool.uv.sources]
@@ -12,17 +13,22 @@
 import torch
 import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
-import torch, torch.nn.functional as F
+from kernels import get_kernel
 
+# Load the activation kernel
+activation = get_kernel("kernels-community/activation")
 
-def swiglu_eager(x):
-    d = x.shape[-1] // 2
-    return F.silu(x[..., :d]) * x[..., d:]
+
+def hf_kernels_swiglu(input_tensor):
+    hidden_dim = input_tensor.shape[-1] // 2
+    out_shape = input_tensor.shape[:-1] + (hidden_dim,)
+    out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
+    return activation.silu_and_mul(out, input_tensor)
 
 
 run_benchmark(
     kernel_type=KernelTypeEnum.ACTIVATION,
-    impl_name="torch_eager",
-    impl_tags={"family":"hf-kernels", "backend":"eager"},
-    impl_func=swiglu_eager,
+    impl_name="hf_kernels_swiglu",
+    impl_tags={"family": "hf-kernels", "backend": "cuda"},
+    impl_func=hf_kernels_swiglu,
 )
\ No newline at end of file
diff --git a/activation/impls/hf_kernels_swiglu.html b/activation/impls/hf_kernels_swiglu.html
index 5e56fd70e1083aed94a3dfe7ff9853871555ec6f..4784a25a1ce49622400c06aa26dd061266b6418d 100644
--- a/activation/impls/hf_kernels_swiglu.html
+++ b/activation/impls/hf_kernels_swiglu.html
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 4.02s
+Cell: nv | 0.26s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,34 +3887,22 @@ Cell: nv | 4.02s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Fri Oct 24 19:18:43 2025       
+<div class="cell-stdout"><pre class="stdout-text">Mon Oct 27 14:46:00 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
-|   0  NVIDIA L4                      Off |   00000000:38:00.0 Off |                    0 |
-| N/A   35C    P0             27W /   72W |       1MiB /  23034MiB |      0%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   1  NVIDIA L4                      Off |   00000000:3A:00.0 Off |                    0 |
-| N/A   32C    P0             28W /   72W |       1MiB /  23034MiB |      2%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   2  NVIDIA L4                      Off |   00000000:3C:00.0 Off |                    0 |
-| N/A   34C    P0             27W /   72W |       1MiB /  23034MiB |      1%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   3  NVIDIA L4                      Off |   00000000:3E:00.0 Off |                    0 |
-| N/A   32C    P0             27W /   72W |       1MiB /  23034MiB |      2%      Default |
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   32C    P0            153W /  350W |       0MiB /  46068MiB |     75%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
 +-----------------------------------------------------------------------------------------+
 | Processes:                                                                              |
-|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
 |        ID   ID                                                               Usage      |
 |=========================================================================================|
 |  No running processes found                                                             |
@@ -3932,7 +3920,7 @@ Cell: nv | 4.02s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 43.68s
+Cell: benchmark | 4.32s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3988,17 +3976,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      72.447us      1172.85%      72.447us      72.447us             1  
-                                      hf_kernels_swiglu        10.70%     189.904us        99.62%       1.769ms       1.769ms       0.000us         0.00%       8.289us       8.289us             1  
-                      _activation_beeaae6::silu_and_mul         1.07%      18.931us        86.38%       1.534ms     511.168us       6.177us       100.00%       8.289us       2.763us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       6.177us       100.00%       6.177us       2.059us             3  
-                                Activity Buffer Request        82.95%       1.473ms        82.95%       1.473ms       1.473ms       2.112us        34.19%       2.112us       2.112us             1  
-                                            aten::empty         2.54%      45.151us         2.54%      45.151us      15.050us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.36%      41.961us         2.36%      41.961us      13.987us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.38%       6.701us         0.38%       6.701us       6.701us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      80.128us      1940.62%      80.128us      80.128us             1  
+                                      hf_kernels_swiglu        11.19%     199.383us        99.56%       1.774ms       1.774ms       0.000us         0.00%       5.634us       5.634us             1  
+                      _activation_beeaae6::silu_and_mul         1.10%      19.601us        85.64%       1.526ms     508.618us       4.129us       100.00%       5.634us       1.878us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.129us       100.00%       4.129us       1.376us             3  
+                                Activity Buffer Request        82.30%       1.466ms        82.30%       1.466ms       1.466ms       1.505us        36.45%       1.505us       1.505us             1  
+                                            aten::empty         2.73%      48.641us         2.73%      48.641us      16.214us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.24%      39.931us         2.24%      39.931us      13.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.44%       7.891us         0.44%       7.891us       7.891us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.775ms
-Self CUDA time total: 6.177us
+Self CPU time total: 1.782ms
+Self CUDA time total: 4.129us
 
 
 
@@ -4008,17 +3996,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      91.934us      1135.55%      91.934us      91.934us             1  
-                                      hf_kernels_swiglu         6.80%     114.004us        99.69%       1.672ms       1.672ms       0.000us         0.00%      10.816us      10.816us             1  
-                      _activation_beeaae6::silu_and_mul         1.26%      21.089us        91.64%       1.537ms     512.271us       8.096us       100.00%      10.816us       3.605us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       8.096us       100.00%       8.096us       2.699us             3  
-                                Activity Buffer Request        88.69%       1.487ms        88.69%       1.487ms       1.487ms       2.720us        33.60%       2.720us       2.720us             1  
-                                            aten::empty         1.24%      20.870us         1.24%      20.870us       6.957us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.70%      28.501us         1.70%      28.501us       9.500us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.31%       5.260us         0.31%       5.260us       5.260us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      77.823us      1961.76%      77.823us      77.823us             1  
+                                      hf_kernels_swiglu         7.28%     119.722us        99.70%       1.640ms       1.640ms       0.000us         0.00%       5.311us       5.311us             1  
+                      _activation_beeaae6::silu_and_mul         1.57%      25.841us        91.18%       1.500ms     499.858us       3.967us       100.00%       5.311us       1.770us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       3.967us       100.00%       3.967us       1.322us             3  
+                                Activity Buffer Request        87.74%       1.443ms        87.74%       1.443ms       1.443ms       1.344us        33.88%       1.344us       1.344us             1  
+                                            aten::empty         1.24%      20.410us         1.24%      20.410us       6.803us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.86%      30.650us         1.86%      30.650us      10.217us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       4.930us         0.30%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.677ms
-Self CUDA time total: 8.096us
+Self CPU time total: 1.645ms
+Self CUDA time total: 3.967us
 
 
 
@@ -4028,17 +4016,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.039us       596.86%      67.039us      67.039us             1  
-                                      hf_kernels_swiglu         5.22%      85.373us        99.71%       1.630ms       1.630ms       0.000us         0.00%      15.008us      15.008us             1  
-                      _activation_beeaae6::silu_and_mul         1.19%      19.431us        93.38%       1.527ms     508.877us      11.232us       100.00%      15.008us       5.003us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      11.232us       100.00%      11.232us       3.744us             3  
-                                Activity Buffer Request        90.58%       1.481ms        90.58%       1.481ms       1.481ms       3.776us        33.62%       3.776us       3.776us             1  
-                                            aten::empty         1.11%      18.160us         1.11%      18.160us       6.053us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.61%      26.370us         1.61%      26.370us       8.790us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.29%       4.730us         0.29%       4.730us       4.730us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.487us      1369.46%      67.487us      67.487us             1  
+                                      hf_kernels_swiglu         6.70%     107.400us        99.69%       1.598ms       1.598ms       0.000us         0.00%       6.592us       6.592us             1  
+                      _activation_beeaae6::silu_and_mul         1.32%      21.191us        91.79%       1.471ms     490.438us       4.928us       100.00%       6.592us       2.197us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.928us       100.00%       4.928us       1.643us             3  
+                                Activity Buffer Request        88.89%       1.425ms        88.89%       1.425ms       1.425ms       1.664us        33.77%       1.664us       1.664us             1  
+                                            aten::empty         1.20%      19.281us         1.20%      19.281us       6.427us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.57%      25.210us         1.57%      25.210us       8.403us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.31%       4.970us         0.31%       4.970us       4.970us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.635ms
-Self CUDA time total: 11.232us
+Self CPU time total: 1.603ms
+Self CUDA time total: 4.928us
 
 
 
@@ -4048,17 +4036,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      69.598us       870.08%      69.598us      69.598us             1  
-                                      hf_kernels_swiglu         4.94%      87.632us        99.74%       1.771ms       1.771ms       0.000us         0.00%      10.719us      10.719us             1  
-                      _activation_beeaae6::silu_and_mul         1.09%      19.352us        93.69%       1.663ms     554.452us       7.999us       100.00%      10.719us       3.573us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       7.999us       100.00%       7.999us       2.666us             3  
-                                Activity Buffer Request        83.17%       1.477ms        83.17%       1.477ms       1.477ms       2.720us        34.00%       2.720us       2.720us             1  
-                                            aten::empty         1.11%      19.710us         1.11%      19.710us       6.570us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.43%     167.443us         9.43%     167.443us      55.814us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.26%       4.610us         0.26%       4.610us       4.610us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      75.265us      1768.03%      75.265us      75.265us             1  
+                                      hf_kernels_swiglu         6.51%     118.032us        99.70%       1.807ms       1.807ms       0.000us         0.00%       5.697us       5.697us             1  
+                      _activation_beeaae6::silu_and_mul         1.22%      22.071us        92.05%       1.668ms     556.119us       4.257us       100.00%       5.697us       1.899us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.257us       100.00%       4.257us       1.419us             3  
+                                Activity Buffer Request        79.39%       1.439ms        79.39%       1.439ms       1.439ms       1.440us        33.83%       1.440us       1.440us             1  
+                                            aten::empty         1.14%      20.640us         1.14%      20.640us       6.880us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        11.45%     207.513us        11.45%     207.513us      69.171us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       5.350us         0.30%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.775ms
-Self CUDA time total: 7.999us
+Self CPU time total: 1.812ms
+Self CUDA time total: 4.257us
 
 
 
@@ -4068,17 +4056,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      70.239us       570.12%      70.239us      70.239us             1  
-                                      hf_kernels_swiglu         5.14%      91.331us        99.75%       1.772ms       1.772ms       0.000us         0.00%      16.448us      16.448us             1  
-                      _activation_beeaae6::silu_and_mul         1.09%      19.360us        93.54%       1.662ms     553.872us      12.320us       100.00%      16.448us       5.483us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      12.320us       100.00%      12.320us       4.107us             3  
-                                Activity Buffer Request        83.14%       1.477ms        83.14%       1.477ms       1.477ms       4.128us        33.51%       4.128us       4.128us             1  
-                                            aten::empty         1.07%      19.032us         1.07%      19.032us       6.344us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.31%     165.333us         9.31%     165.333us      55.111us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.25%       4.400us         0.25%       4.400us       4.400us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      65.471us      1111.94%      65.471us      65.471us             1  
+                                      hf_kernels_swiglu        19.52%      89.390us        98.84%     452.537us     452.537us       0.000us         0.00%       7.872us       7.872us             1  
+                      _activation_beeaae6::silu_and_mul         5.02%      23.003us        75.04%     343.547us     114.516us       5.888us       100.00%       7.872us       2.624us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us       100.00%       5.888us       1.963us             3  
+                                Activity Buffer Request        33.89%     155.152us        33.89%     155.152us     155.152us       1.984us        33.70%       1.984us       1.984us             1  
+                                            aten::empty         4.28%      19.600us         4.28%      19.600us       6.533us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        36.13%     165.392us        36.13%     165.392us      55.131us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.16%       5.290us         1.16%       5.290us       5.290us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.776ms
-Self CUDA time total: 12.320us
+Self CPU time total: 457.827us
+Self CUDA time total: 5.888us
 
 
 
@@ -4088,17 +4076,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      68.766us       394.32%      68.766us      68.766us             1  
-                                      hf_kernels_swiglu        16.12%      86.942us        99.12%     534.642us     534.642us       0.000us         0.00%      23.263us      23.263us             1  
-                      _activation_beeaae6::silu_and_mul         3.56%      19.181us        79.14%     426.890us     142.297us      17.439us       100.00%      23.263us       7.754us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      17.439us       100.00%      17.439us       5.813us             3  
-                                Activity Buffer Request        44.72%     241.246us        44.72%     241.246us     241.246us       5.824us        33.40%       5.824us       5.824us             1  
-                                            aten::empty         3.86%      20.810us         3.86%      20.810us       6.937us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.86%     166.463us        30.86%     166.463us      55.488us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.88%       4.760us         0.88%       4.760us       4.760us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      68.383us       879.52%      68.383us      68.383us             1  
+                                      hf_kernels_swiglu         6.83%     118.711us        99.72%       1.734ms       1.734ms       0.000us         0.00%      10.367us      10.367us             1  
+                      _activation_beeaae6::silu_and_mul         1.25%      21.741us        91.78%       1.596ms     531.855us       7.775us       100.00%      10.367us       3.456us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       7.775us       100.00%       7.775us       2.592us             3  
+                                Activity Buffer Request        81.74%       1.421ms        81.74%       1.421ms       1.421ms       2.592us        33.34%       2.592us       2.592us             1  
+                                            aten::empty         1.11%      19.311us         1.11%      19.311us       6.437us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.79%     152.752us         8.79%     152.752us      50.917us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       4.930us         0.28%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 539.402us
-Self CUDA time total: 17.439us
+Self CPU time total: 1.739ms
+Self CUDA time total: 7.775us
 
 
 
@@ -4108,17 +4096,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.422us       541.63%      67.422us      67.422us             1  
-                                      hf_kernels_swiglu        15.67%      86.170us        99.13%     545.172us     545.172us       0.000us         0.00%      16.576us      16.576us             1  
-                      _activation_beeaae6::silu_and_mul         3.45%      18.981us        79.89%     439.370us     146.457us      12.448us       100.00%      16.576us       5.525us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      12.448us       100.00%      12.448us       4.149us             3  
-                                Activity Buffer Request        46.28%     254.506us        46.28%     254.506us     254.506us       4.128us        33.16%       4.128us       4.128us             1  
-                                            aten::empty         3.57%      19.632us         3.57%      19.632us       6.544us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.16%     165.883us        30.16%     165.883us      55.294us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.87%       4.770us         0.87%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      70.527us      1069.89%      70.527us      70.527us             1  
+                                      hf_kernels_swiglu         6.20%     108.691us        99.73%       1.749ms       1.749ms       0.000us         0.00%       8.800us       8.800us             1  
+                      _activation_beeaae6::silu_and_mul         1.29%      22.622us        92.35%       1.619ms     539.785us       6.592us       100.00%       8.800us       2.933us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       6.592us       100.00%       6.592us       2.197us             3  
+                                Activity Buffer Request        82.48%       1.446ms        82.48%       1.446ms       1.446ms       2.208us        33.50%       2.208us       2.208us             1  
+                                            aten::empty         1.18%      20.650us         1.18%      20.650us       6.883us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.58%     150.492us         8.58%     150.492us      50.164us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.780us         0.27%       4.780us       4.780us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 549.942us
-Self CUDA time total: 12.448us
+Self CPU time total: 1.753ms
+Self CUDA time total: 6.592us
 
 
 
@@ -4128,17 +4116,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      70.941us       341.59%      70.941us      70.941us             1  
-                                      hf_kernels_swiglu        15.89%      87.442us        99.17%     545.692us     545.692us       0.000us         0.00%      27.744us      27.744us             1  
-                      _activation_beeaae6::silu_and_mul         3.49%      19.210us        79.79%     439.080us     146.360us      20.768us       100.00%      27.744us       9.248us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      20.768us       100.00%      20.768us       6.923us             3  
-                                Activity Buffer Request        46.16%     253.986us        46.16%     253.986us     253.986us       6.976us        33.59%       6.976us       6.976us             1  
-                                            aten::empty         3.48%      19.170us         3.48%      19.170us       6.390us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.15%     165.884us        30.15%     165.884us      55.295us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.83%       4.591us         0.83%       4.591us       4.591us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      66.591us       703.03%      66.591us      66.591us             1  
+                                      hf_kernels_swiglu        22.91%      88.512us        98.75%     381.506us     381.506us       0.000us         0.00%      12.640us      12.640us             1  
+                      _activation_beeaae6::silu_and_mul         5.22%      20.151us        70.42%     272.064us      90.688us       9.472us       100.00%      12.640us       4.213us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       9.472us       100.00%       9.472us       3.157us             3  
+                                Activity Buffer Request        26.21%     101.241us        26.21%     101.241us     101.241us       3.168us        33.45%       3.168us       3.168us             1  
+                                            aten::empty         5.42%      20.930us         5.42%      20.930us       6.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        39.00%     150.672us        39.00%     150.672us      50.224us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.25%       4.820us         1.25%       4.820us       4.820us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 550.283us
-Self CUDA time total: 20.768us
+Self CPU time total: 386.326us
+Self CUDA time total: 9.472us
 
 
 
@@ -4148,17 +4136,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      70.780us       228.74%      70.780us      70.780us             1  
-                                      hf_kernels_swiglu        16.83%      85.362us        99.15%     502.911us     502.911us       0.000us         0.00%      41.183us      41.183us             1  
-                      _activation_beeaae6::silu_and_mul         3.74%      18.980us        78.74%     399.388us     133.129us      30.943us       100.00%      41.183us      13.728us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      30.943us       100.00%      30.943us      10.314us             3  
-                                Activity Buffer Request        42.65%     216.335us        42.65%     216.335us     216.335us      10.240us        33.09%      10.240us      10.240us             1  
-                                            aten::empty         3.58%      18.161us         3.58%      18.161us       6.054us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.35%     164.073us        32.35%     164.073us      54.691us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.85%       4.320us         0.85%       4.320us       4.320us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.295us       514.21%      67.295us      67.295us             1  
+                                      hf_kernels_swiglu        24.05%     101.492us        98.90%     417.266us     417.266us       0.000us         0.00%      17.503us      17.503us             1  
+                      _activation_beeaae6::silu_and_mul         5.33%      22.480us        70.08%     295.684us      98.561us      13.087us       100.00%      17.503us       5.834us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      13.087us       100.00%      13.087us       4.362us             3  
+                                Activity Buffer Request        28.92%     122.012us        28.92%     122.012us     122.012us       4.416us        33.74%       4.416us       4.416us             1  
+                                            aten::empty         4.76%      20.090us         4.76%      20.090us       6.697us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        35.83%     151.192us        35.83%     151.192us      50.397us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.10%       4.660us         1.10%       4.660us       4.660us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 507.231us
-Self CUDA time total: 30.943us
+Self CPU time total: 421.926us
+Self CUDA time total: 13.087us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4175,61 +4163,12 @@ hf_kernels_swiglu        cuda_T512_D768         0.03  True
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-   Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading numpy (15.9MiB)
-Downloading setuptools (1.1MiB)
-Downloading sympy (6.0MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading triton (148.4MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading pillow (6.7MiB)
-Downloading networkx (1.9MiB)
-Downloading matplotlib (8.3MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading hf-xet (3.2MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading fonttools (4.7MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading torch (846.8MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading hf-xet
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
-      Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading nvidia-cuda-cupti-cu12
- Downloading matplotlib
- Downloading numpy
- Downloading nvidia-nvjitlink-cu12
- Downloading sympy
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
-Installed 47 packages in 234ms
+Installed 15 packages in 15ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 7 files:   0%|          | 0/7 [00:00&lt;?, ?it/s]
-Fetching 7 files:  14%|█▍        | 1/7 [00:00&lt;00:01,  4.08it/s]
-Fetching 7 files:  71%|███████▏  | 5/7 [00:00&lt;00:00,  9.40it/s]
-Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 12.20it/s]</div>
+Fetching 7 files:  71%|███████▏  | 5/7 [00:00&lt;00:00, 13.68it/s]
+Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 19.14it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
diff --git a/activation/impls/torch_swiglu.html b/activation/impls/torch_swiglu.html
index fad608debc07afac9b5948fbe5ed773bd10d6a06..6347cf8477b3c77c2a153235fedda937b464164d 100644
--- a/activation/impls/torch_swiglu.html
+++ b/activation/impls/torch_swiglu.html
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 4.02s
+Cell: nv | 0.26s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,34 +3887,22 @@ Cell: nv | 4.02s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Fri Oct 24 19:24:09 2025       
+<div class="cell-stdout"><pre class="stdout-text">Mon Oct 27 14:46:00 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
-|   0  NVIDIA L4                      Off |   00000000:38:00.0 Off |                    0 |
-| N/A   36C    P0             27W /   72W |       1MiB /  23034MiB |      2%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   1  NVIDIA L4                      Off |   00000000:3A:00.0 Off |                    0 |
-| N/A   33C    P0             28W /   72W |       1MiB /  23034MiB |      0%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   2  NVIDIA L4                      Off |   00000000:3C:00.0 Off |                    0 |
-| N/A   34C    P0             27W /   72W |       1MiB /  23034MiB |      0%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   3  NVIDIA L4                      Off |   00000000:3E:00.0 Off |                    0 |
-| N/A   33C    P0             27W /   72W |       1MiB /  23034MiB |      2%      Default |
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   32C    P0            153W /  350W |       0MiB /  46068MiB |     75%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
 +-----------------------------------------------------------------------------------------+
 | Processes:                                                                              |
-|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
 |        ID   ID                                                               Usage      |
 |=========================================================================================|
 |  No running processes found                                                             |
@@ -3932,7 +3920,7 @@ Cell: nv | 4.02s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 42.42s
+Cell: benchmark | 6.99s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3982,20 +3970,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     237.726us      1604.74%     237.726us     237.726us             1  
-                                            torch_eager        11.30%     225.975us        99.63%       1.992ms       1.992ms       0.000us         0.00%      17.566us      17.566us             1  
-                                             aten::silu         3.42%      68.411us        81.12%       1.622ms     540.728us       7.646us        51.61%      10.398us       3.466us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.646us        51.61%       7.646us       2.549us             3  
-                                              aten::mul         2.15%      42.970us         3.33%      66.621us      22.207us       7.168us        48.39%       7.168us       2.389us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.168us        48.39%       7.168us       2.389us             3  
-                                Activity Buffer Request        74.74%       1.495ms        74.74%       1.495ms       1.495ms       2.752us        18.58%       2.752us       2.752us             1  
-                                            aten::slice         3.26%      65.261us         3.88%      77.582us      12.930us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.62%      12.321us         0.62%      12.321us       2.053us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         4.14%      82.803us         4.14%      82.803us      13.800us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.37%       7.380us         0.37%       7.380us       7.380us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     183.359us      1436.08%     183.359us     183.359us             1  
+                                            torch_eager        11.24%     212.694us        99.53%       1.883ms       1.883ms       0.000us         0.00%      15.072us      15.072us             1  
+                                             aten::silu         3.31%      62.660us        82.30%       1.557ms     519.134us       6.527us        51.12%       8.831us       2.944us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.527us        51.12%       6.527us       2.176us             3  
+                                              aten::mul         1.85%      35.100us         2.98%      56.340us      18.780us       6.241us        48.88%       6.241us       2.080us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.241us        48.88%       6.241us       2.080us             3  
+                                Activity Buffer Request        76.74%       1.452ms        76.74%       1.452ms       1.452ms       2.304us        18.05%       2.304us       2.304us             1  
+                                            aten::slice         2.41%      45.561us         3.01%      56.902us       9.484us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.60%      11.341us         0.60%      11.341us       1.890us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.37%      63.741us         3.37%      63.741us      10.623us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.47%       8.969us         0.47%       8.969us       8.969us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.000ms
-Self CUDA time total: 14.814us
+Self CPU time total: 1.892ms
+Self CUDA time total: 12.768us
 
 
 
@@ -4005,20 +3993,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     155.197us      1056.55%     155.197us     155.197us             1  
-                                            torch_eager         6.38%     113.914us        99.69%       1.779ms       1.779ms       0.000us         0.00%      17.249us      17.249us             1  
-                                             aten::silu         2.13%      37.960us        88.89%       1.587ms     528.841us       7.616us        51.85%      10.176us       3.392us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.616us        51.85%       7.616us       2.539us             3  
-                                              aten::mul         1.58%      28.130us         2.63%      46.991us      15.664us       7.073us        48.15%       7.073us       2.358us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.073us        48.15%       7.073us       2.358us             3  
-                                Activity Buffer Request        85.27%       1.522ms        85.27%       1.522ms       1.522ms       2.560us        17.43%       2.560us       2.560us             1  
-                                            aten::slice         1.43%      25.481us         1.78%      31.850us       5.308us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.36%       6.369us         0.36%       6.369us       1.061us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         2.55%      45.552us         2.55%      45.552us       7.592us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.31%       5.590us         0.31%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     158.431us      1279.63%     158.431us     158.431us             1  
+                                            torch_eager         6.85%     117.301us        99.69%       1.707ms       1.707ms       0.000us         0.00%      14.557us      14.557us             1  
+                                             aten::silu         2.45%      41.990us        88.25%       1.511ms     503.680us       6.398us        51.68%       8.574us       2.858us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.398us        51.68%       6.398us       2.133us             3  
+                                              aten::mul         1.63%      27.830us         2.78%      47.630us      15.877us       5.983us        48.32%       5.983us       1.994us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.983us        48.32%       5.983us       1.994us             3  
+                                Activity Buffer Request        84.28%       1.443ms        84.28%       1.443ms       1.443ms       2.176us        17.58%       2.176us       2.176us             1  
+                                            aten::slice         1.45%      24.820us         1.81%      30.931us       5.155us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.36%       6.111us         0.36%       6.111us       1.019us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.67%      45.711us         2.67%      45.711us       7.618us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.31%       5.320us         0.31%       5.320us       5.320us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.785ms
-Self CUDA time total: 14.689us
+Self CPU time total: 1.712ms
+Self CUDA time total: 12.381us
 
 
 
@@ -4028,20 +4016,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     157.724us       928.23%     157.724us     157.724us             1  
-                                            torch_eager         6.06%     107.501us        99.72%       1.769ms       1.769ms       0.000us         0.00%      19.872us      19.872us             1  
-                                             aten::silu         2.60%      46.162us        89.17%       1.581ms     527.145us       8.576us        50.47%      11.456us       3.819us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       8.576us        50.47%       8.576us       2.859us             3  
-                                              aten::mul         1.54%      27.281us         2.61%      46.211us      15.404us       8.416us        49.53%       8.416us       2.805us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       8.416us        49.53%       8.416us       2.805us             3  
-                                Activity Buffer Request        85.05%       1.508ms        85.05%       1.508ms       1.508ms       2.880us        16.95%       2.880us       2.880us             1  
-                                            aten::slice         1.51%      26.721us         1.88%      33.391us       5.565us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.38%       6.670us         0.38%       6.670us       1.112us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         2.58%      45.781us         2.58%      45.781us       7.630us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.28%       4.940us         0.28%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     145.182us      1095.88%     145.182us     145.182us             1  
+                                            torch_eager         6.28%     105.841us        99.65%       1.680ms       1.680ms       0.000us         0.00%      15.552us      15.552us             1  
+                                             aten::silu         2.40%      40.400us        89.03%       1.501ms     500.258us       6.816us        51.45%       9.120us       3.040us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.816us        51.45%       6.816us       2.272us             3  
+                                              aten::mul         1.52%      25.690us         2.64%      44.480us      14.827us       6.432us        48.55%       6.432us       2.144us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.432us        48.55%       6.432us       2.144us             3  
+                                Activity Buffer Request        85.10%       1.434ms        85.10%       1.434ms       1.434ms       2.304us        17.39%       2.304us       2.304us             1  
+                                            aten::slice         1.37%      23.030us         1.70%      28.690us       4.782us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.34%       5.660us         0.34%       5.660us       0.943us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.66%      44.762us         2.66%      44.762us       7.460us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.35%       5.820us         0.35%       5.820us       5.820us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.773ms
-Self CUDA time total: 16.992us
+Self CPU time total: 1.686ms
+Self CUDA time total: 13.248us
 
 
 
@@ -4051,20 +4039,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     154.972us       984.26%     154.972us     154.972us             1  
-                                            torch_eager         7.81%     106.363us        99.66%       1.357ms       1.357ms       0.000us         0.00%      18.497us      18.497us             1  
-                                             aten::silu         3.01%      41.020us        86.15%       1.173ms     391.021us       8.096us        51.42%      10.848us       3.616us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       8.096us        51.42%       8.096us       2.699us             3  
-                                              aten::mul         1.89%      25.761us         3.27%      44.581us      14.860us       7.649us        48.58%       7.649us       2.550us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.649us        48.58%       7.649us       2.550us             3  
-                                Activity Buffer Request        68.76%     936.210us        68.76%     936.210us     936.210us       2.752us        17.48%       2.752us       2.752us             1  
-                                            aten::slice         1.90%      25.829us         2.43%      33.031us       5.505us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.53%       7.202us         0.53%       7.202us       1.200us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        15.76%     214.654us        15.76%     214.654us      35.776us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.34%       4.590us         0.34%       4.590us       4.590us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     145.025us      1135.85%     145.025us     145.025us             1  
+                                            torch_eager         7.55%     116.292us        99.65%       1.535ms       1.535ms       0.000us         0.00%      14.976us      14.976us             1  
+                                             aten::silu         2.67%      41.061us        87.34%       1.345ms     448.460us       6.592us        51.63%       8.800us       2.933us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.592us        51.63%       6.592us       2.197us             3  
+                                              aten::mul         1.71%      26.359us         2.88%      44.330us      14.777us       6.176us        48.37%       6.176us       2.059us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        48.37%       6.176us       2.059us             3  
+                                Activity Buffer Request        69.61%       1.072ms        69.61%       1.072ms       1.072ms       2.208us        17.29%       2.208us       2.208us             1  
+                                            aten::slice         1.52%      23.350us         1.89%      29.050us       4.842us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.37%       5.700us         0.37%       5.700us       0.950us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        16.23%     250.045us        16.23%     250.045us      41.674us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.35%       5.360us         0.35%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.362ms
-Self CUDA time total: 15.745us
+Self CPU time total: 1.540ms
+Self CUDA time total: 12.768us
 
 
 
@@ -4074,20 +4062,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     155.837us       910.37%     155.837us     155.837us             1  
-                                            torch_eager         5.68%     106.351us        99.75%       1.869ms       1.869ms       0.000us         0.00%      20.126us      20.126us             1  
-                                             aten::silu         2.11%      39.481us        89.91%       1.685ms     561.559us       8.671us        50.65%      11.679us       3.893us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       8.671us        50.65%       8.671us       2.890us             3  
-                                              aten::mul         1.44%      26.891us         2.49%      46.661us      15.554us       8.447us        49.35%       8.447us       2.816us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       8.447us        49.35%       8.447us       2.816us             3  
-                                Activity Buffer Request        78.61%       1.473ms        78.61%       1.473ms       1.473ms       3.008us        17.57%       3.008us       3.008us             1  
-                                            aten::slice         1.33%      24.861us         1.68%      31.451us       5.242us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.35%       6.590us         0.35%       6.590us       1.098us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.25%     192.054us        10.25%     192.054us      32.009us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       4.670us         0.25%       4.670us       4.670us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     144.030us      1089.82%     144.030us     144.030us             1  
+                                            torch_eager         5.82%     104.551us        99.68%       1.792ms       1.792ms       0.000us         0.00%      15.488us      15.488us             1  
+                                             aten::silu         2.32%      41.682us        89.81%       1.614ms     538.151us       6.752us        51.09%       9.024us       3.008us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.752us        51.09%       6.752us       2.251us             3  
+                                              aten::mul         1.41%      25.409us         2.48%      44.550us      14.850us       6.464us        48.91%       6.464us       2.155us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.464us        48.91%       6.464us       2.155us             3  
+                                Activity Buffer Request        78.50%       1.411ms        78.50%       1.411ms       1.411ms       2.272us        17.19%       2.272us       2.272us             1  
+                                            aten::slice         1.27%      22.830us         1.58%      28.320us       4.720us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.31%       5.490us         0.31%       5.490us       0.915us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.06%     180.853us        10.06%     180.853us      30.142us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.32%       5.710us         0.32%       5.710us       5.710us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.874ms
-Self CUDA time total: 17.118us
+Self CPU time total: 1.798ms
+Self CUDA time total: 13.216us
 
 
 
@@ -4097,20 +4085,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     155.486us       596.92%     155.486us     155.486us             1  
-                                            torch_eager        20.98%     108.302us        99.11%     511.621us     511.621us       0.000us         0.00%      30.592us      30.592us             1  
-                                             aten::silu         7.61%      39.290us        63.32%     326.866us     108.955us      13.504us        51.84%      18.048us       6.016us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      13.504us        51.84%      13.504us       4.501us             3  
-                                              aten::mul         5.03%      25.960us         8.46%      43.671us      14.557us      12.544us        48.16%      12.544us       4.181us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      12.544us        48.16%      12.544us       4.181us             3  
-                                Activity Buffer Request        25.15%     129.813us        25.15%     129.813us     129.813us       4.544us        17.44%       4.544us       4.544us             1  
-                                            aten::slice         5.13%      26.471us         6.35%      32.782us       5.464us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         1.22%       6.311us         1.22%       6.311us       1.052us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        33.99%     175.474us        33.99%     175.474us      29.246us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.89%       4.611us         0.89%       4.611us       4.611us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     140.382us       902.66%     140.382us     140.382us             1  
+                                            torch_eager        21.39%     103.633us        98.99%     479.697us     479.697us       0.000us         0.00%      18.240us      18.240us             1  
+                                             aten::silu         8.56%      41.460us        63.18%     306.154us     102.051us       7.936us        51.03%      10.624us       3.541us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        51.03%       7.936us       2.645us             3  
+                                              aten::mul         4.90%      23.759us         8.63%      41.840us      13.947us       7.616us        48.97%       7.616us       2.539us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.616us        48.97%       7.616us       2.539us             3  
+                                Activity Buffer Request        23.12%     112.032us        23.12%     112.032us     112.032us       2.688us        17.28%       2.688us       2.688us             1  
+                                            aten::slice         4.68%      22.671us         5.79%      28.070us       4.678us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         1.11%       5.399us         1.11%       5.399us       0.900us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        35.23%     170.743us        35.23%     170.743us      28.457us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.01%       4.900us         1.01%       4.900us       4.900us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 516.232us
-Self CUDA time total: 26.048us
+Self CPU time total: 484.597us
+Self CUDA time total: 15.552us
 
 
 
@@ -4120,20 +4108,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     163.293us       685.87%     163.293us     163.293us             1  
-                                            torch_eager         5.58%     106.954us        99.75%       1.910ms       1.910ms       0.000us         0.00%      27.872us      27.872us             1  
-                                             aten::silu         2.13%      40.799us        89.92%       1.722ms     574.075us      12.032us        50.54%      16.096us       5.365us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      12.032us        50.54%      12.032us       4.011us             3  
-                                              aten::mul         1.39%      26.590us         2.40%      46.050us      15.350us      11.776us        49.46%      11.776us       3.925us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.776us        49.46%      11.776us       3.925us             3  
-                                Activity Buffer Request        79.43%       1.521ms        79.43%       1.521ms       1.521ms       4.064us        17.07%       4.064us       4.064us             1  
-                                            aten::slice         1.44%      27.592us         1.83%      35.091us       5.849us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.39%       7.499us         0.39%       7.499us       1.250us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.38%     179.564us         9.38%     179.564us      29.927us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       4.880us         0.25%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     145.662us      1011.54%     145.662us     145.662us             1  
+                                            torch_eager         5.99%     108.381us        99.73%       1.804ms       1.804ms       0.000us         0.00%      16.896us      16.896us             1  
+                                             aten::silu         2.28%      41.342us        89.69%       1.623ms     540.945us       7.392us        51.33%       9.888us       3.296us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        51.33%       7.392us       2.464us             3  
+                                              aten::mul         1.44%      26.049us         2.45%      44.420us      14.807us       7.008us        48.67%       7.008us       2.336us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.008us        48.67%       7.008us       2.336us             3  
+                                Activity Buffer Request        78.99%       1.429ms        78.99%       1.429ms       1.429ms       2.496us        17.33%       2.496us       2.496us             1  
+                                            aten::slice         1.28%      23.160us         1.59%      28.810us       4.802us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.31%       5.650us         0.31%       5.650us       0.942us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.43%     170.603us         9.43%     170.603us      28.434us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       4.930us         0.27%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.915ms
-Self CUDA time total: 23.808us
+Self CPU time total: 1.809ms
+Self CUDA time total: 14.400us
 
 
 
@@ -4143,20 +4131,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     157.181us       605.66%     157.181us     157.181us             1  
-                                            torch_eager         5.64%     105.982us        99.73%       1.874ms       1.874ms       0.000us         0.00%      30.528us      30.528us             1  
-                                             aten::silu         2.16%      40.612us        89.86%       1.688ms     562.829us      13.440us        51.79%      18.016us       6.005us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      13.440us        51.79%      13.440us       4.480us             3  
-                                              aten::mul         1.34%      25.270us         2.38%      44.720us      14.907us      12.512us        48.21%      12.512us       4.171us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      12.512us        48.21%      12.512us       4.171us             3  
-                                Activity Buffer Request        79.27%       1.489ms        79.27%       1.489ms       1.489ms       4.576us        17.63%       4.576us       4.576us             1  
-                                            aten::slice         1.48%      27.801us         1.85%      34.741us       5.790us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.37%       6.940us         0.37%       6.940us       1.157us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.47%     177.873us         9.47%     177.873us      29.645us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.27%       5.010us         0.27%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     142.206us       914.45%     142.206us     142.206us             1  
+                                            torch_eager        21.70%     105.494us        98.87%     480.727us     480.727us       0.000us         0.00%      18.239us      18.239us             1  
+                                             aten::silu         8.21%      39.900us        62.39%     303.354us     101.118us       7.966us        51.23%      10.654us       3.551us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.966us        51.23%       7.966us       2.655us             3  
+                                              aten::mul         5.16%      25.070us         8.84%      42.990us      14.330us       7.585us        48.77%       7.585us       2.528us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.585us        48.77%       7.585us       2.528us             3  
+                                Activity Buffer Request        23.29%     113.242us        23.29%     113.242us     113.242us       2.688us        17.29%       2.688us       2.688us             1  
+                                            aten::slice         4.75%      23.080us         5.94%      28.889us       4.815us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         1.19%       5.809us         1.19%       5.809us       0.968us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        34.58%     168.132us        34.58%     168.132us      28.022us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.13%       5.500us         1.13%       5.500us       5.500us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.879ms
-Self CUDA time total: 25.952us
+Self CPU time total: 486.227us
+Self CUDA time total: 15.551us
 
 
 
@@ -4166,26 +4154,26 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     158.077us       375.10%     158.077us     158.077us             1  
-                                            torch_eager         5.61%     105.585us        99.74%       1.877ms       1.877ms       0.000us         0.00%      49.375us      49.375us             1  
-                                             aten::silu         2.18%      41.121us        90.06%       1.695ms     564.996us      21.856us        51.86%      29.088us       9.696us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      21.856us        51.86%      21.856us       7.285us             3  
-                                              aten::mul         1.38%      26.000us         2.45%      46.100us      15.367us      20.287us        48.14%      20.287us       6.762us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      20.287us        48.14%      20.287us       6.762us             3  
-                                Activity Buffer Request        79.53%       1.497ms        79.53%       1.497ms       1.497ms       7.232us        17.16%       7.232us       7.232us             1  
-                                            aten::slice         1.26%      23.718us         1.62%      30.479us       5.080us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.36%       6.761us         0.36%       6.761us       1.127us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.41%     177.183us         9.41%     177.183us      29.531us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       4.970us         0.26%       4.970us       4.970us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     149.022us       661.50%     149.022us     149.022us             1  
+                                            torch_eager         5.72%     105.900us        99.72%       1.847ms       1.847ms       0.000us         0.00%      26.431us      26.431us             1  
+                                             aten::silu         2.24%      41.461us        90.05%       1.668ms     555.875us      11.552us        51.28%      15.455us       5.152us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        51.28%      11.552us       3.851us             3  
+                                              aten::mul         1.41%      26.021us         2.40%      44.421us      14.807us      10.976us        48.72%      10.976us       3.659us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.976us        48.72%      10.976us       3.659us             3  
+                                Activity Buffer Request        79.50%       1.472ms        79.50%       1.472ms       1.472ms       3.903us        17.33%       3.903us       3.903us             1  
+                                            aten::slice         1.25%      23.131us         1.56%      28.831us       4.805us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.31%       5.700us         0.31%       5.700us       0.950us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.31%     172.382us         9.31%     172.382us      28.730us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       5.130us         0.28%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.882ms
-Self CUDA time total: 42.143us
+Self CPU time total: 1.852ms
+Self CUDA time total: 22.528us
 
 
 impl                     wl                  p50(ms)  ok
 torch_eager              cuda_T128_D1024        0.05  True
 torch_eager              cuda_T128_D2048        0.05  True
-torch_eager              cuda_T128_D768         0.05  True
+torch_eager              cuda_T128_D768         0.04  True
 torch_eager              cuda_T256_D1024        0.05  True
 torch_eager              cuda_T256_D2048        0.05  True
 torch_eager              cuda_T256_D768         0.05  True
@@ -4196,53 +4184,7 @@ torch_eager              cuda_T512_D768         0.05  True
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-   Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading setuptools (1.1MiB)
-Downloading sympy (6.0MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading networkx (1.9MiB)
-Downloading numpy (15.9MiB)
-Downloading matplotlib (8.3MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading pillow (6.7MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading fonttools (4.7MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading torch (846.8MiB)
-Downloading triton (148.4MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading networkx
- Downloading fonttools
- Downloading pillow
-      Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading matplotlib
- Downloading nvidia-cuda-cupti-cu12
- Downloading numpy
- Downloading nvidia-nvjitlink-cu12
- Downloading sympy
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
-Installed 37 packages in 214ms
+Installed 37 packages in 246ms
 </div>
 </div>
 <div class="cell-artifacts">
diff --git a/activation/results/artifacts/combine/latency.svg b/activation/results/artifacts/combine/latency.svg
index bc4b8900664bea348ce4e4cdc17535a6ff8d8951..02e24e06df11cd1929543b7b6eb05b29ace9034e 100644
--- a/activation/results/artifacts/combine/latency.svg
+++ b/activation/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f286130086ddc73e4e87d0a2a68de7b2f17cff9f893d7fad0e1eb7210cf7e246
-size 20694
+oid sha256:9254fad09b1905d500f91c98ba5debdf4f6497c196acc2cdc499c0572bc73647
+size 20632
diff --git a/activation/results/combined_results.html b/activation/results/combined_results.html
index 424cb36877c2c2c6d1afa9c948ce04e702e8766f..ebf73560e992accff356031bd9555e356bb61b32 100644
--- a/activation/results/combined_results.html
+++ b/activation/results/combined_results.html
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-24T19:26:55.354611</dc:date>
+    <dc:date>2025-10-27T14:46:43.482898</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4021,83 +4021,83 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 60.23 398.041123  L 847.294169 398.041123  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 452.615548  L 847.294169 452.615548  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_10">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="398.041123" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="452.615548" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_10">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="401.840342" transform="rotate(-0 53.23 401.840342)">0.030</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="456.414767" transform="rotate(-0 53.23 456.414767)">0.025</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 60.23 324.254737  L 847.294169 324.254737  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 373.068398  L 847.294169 373.068398  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_11">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="324.254737" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="373.068398" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_11">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="328.053956" transform="rotate(-0 53.23 328.053956)">0.035</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="376.867617" transform="rotate(-0 53.23 376.867617)">0.030</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 60.23 250.468352  L 847.294169 250.468352  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 293.521249  L 847.294169 293.521249  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_12">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="250.468352" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="293.521249" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_12">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="254.26757" transform="rotate(-0 53.23 254.26757)">0.040</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="297.320468" transform="rotate(-0 53.23 297.320468)">0.035</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 60.23 176.681966  L 847.294169 176.681966  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 213.974099  L 847.294169 213.974099  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_13">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="176.681966" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="213.974099" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_13">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="180.481185" transform="rotate(-0 53.23 180.481185)">0.045</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="217.773318" transform="rotate(-0 53.23 217.773318)">0.040</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 60.23 102.89558  L 847.294169 102.89558  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 134.42695  L 847.294169 134.42695  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_14">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="102.89558" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="134.42695" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_14">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="106.694799" transform="rotate(-0 53.23 106.694799)">0.050</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="138.226168" transform="rotate(-0 53.23 138.226168)">0.045</text>
      </g>
     </g>
     <g id="ytick_6">
      <g id="grid-y--7" class="grid grid-y">
-      <path d="M 60.23 29.109195  L 847.294169 29.109195  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 54.8798  L 847.294169 54.8798  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_15">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="29.109195" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="54.8798" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_15">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="32.908413" transform="rotate(-0 53.23 32.908413)">0.055</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="58.679019" transform="rotate(-0 53.23 58.679019)">0.050</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4105,37 +4105,37 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     </g>
    </g>
    <g id="series--hf-kernels-swiglu" class="series">
-    <path d="M 96.005644 451.16779  L 185.444754 387.991045  L 274.883864 378.251855  L 364.322974 388.728914  L 453.762084 400.830317  L 543.201194 401.582789  L 632.640304 395.827579  L 722.079415 413.683333  L 811.518525 378.989724  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 96.005644 451.16779  L 185.444754 379.591266  L 274.883864 367.802376  L 364.322974 382.120864  L 453.762084 356.82487  L 543.201194 396.121166  L 632.640304 374.96162  L 722.079415 389.136924  L 811.518525 358.734003  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p620c7d392f)">
      <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="185.444754" y="387.991045" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="274.883864" y="378.251855" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="364.322974" y="388.728914" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="453.762084" y="400.830317" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="543.201194" y="401.582789" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="632.640304" y="395.827579" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="722.079415" y="413.683333" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="811.518525" y="378.989724" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="185.444754" y="379.591266" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="274.883864" y="367.802376" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="364.322974" y="382.120864" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="453.762084" y="356.82487" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="543.201194" y="396.121166" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="632.640304" y="374.96162" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="722.079415" y="389.136924" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="811.518525" y="358.734003" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-eager" class="series">
-    <path d="M 96.005644 118.965896  L 185.444754 47.08418  L 274.883864 58.165966  L 364.322974 50.625782  L 453.762084 57.428956  L 543.201194 72.776469  L 632.640304 54.181987  L 722.079415 80.892174  L 811.518525 57.28121  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 96.005644 189.63267  L 185.444754 53.272948  L 274.883864 47.08418  L 364.322974 66.175497  L 453.762084 61.545851  L 543.201194 66.795966  L 632.640304 59.954911  L 722.079415 85.26681  L 811.518525 95.751126  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#p620c7d392f)">
-     <use ns4:href="#m9b8c54d372" x="96.005644" y="118.965896" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="274.883864" y="58.165966" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="364.322974" y="50.625782" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="453.762084" y="57.428956" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="543.201194" y="72.776469" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="632.640304" y="54.181987" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="722.079415" y="80.892174" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="811.518525" y="57.28121" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="96.005644" y="189.63267" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="185.444754" y="53.272948" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="274.883864" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="364.322974" y="66.175497" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="453.762084" y="61.545851" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="543.201194" y="66.795966" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="632.640304" y="59.954911" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="722.079415" y="85.26681" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="811.518525" y="95.751126" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="patch_3">
@@ -4155,25 +4155,25 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
    </g>
    <g id="legend" class="legend">
     <g id="patch_7">
-     <path d="M 720.811356 466.37197  L 840.294169 466.37197  Q 842.294169 466.37197 842.294169 464.37197  L 842.294169 435.45947  Q 842.294169 433.45947 840.294169 433.45947  L 720.811356 433.45947  Q 718.811356 433.45947 718.811356 435.45947  L 718.811356 464.37197  Q 718.811356 466.37197 720.811356 466.37197  L 720.811356 466.37197  z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
+     <path d="M 720.811356 64.7925  L 840.294169 64.7925  Q 842.294169 64.7925 842.294169 62.7925  L 842.294169 33.88  Q 842.294169 31.88 840.294169 31.88  L 720.811356 31.88  Q 718.811356 31.88 718.811356 33.88  L 718.811356 62.7925  Q 718.811356 64.7925 720.811356 64.7925  L 720.811356 64.7925  z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
     </g>
     <g id="line2d_16">
-     <path d="M 722.811356 441.557908  L 732.811356 441.557908  L 742.811356 441.557908  " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+     <path d="M 722.811356 39.978438  L 732.811356 39.978438  L 742.811356 39.978438  " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
+      <use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
      </g>
     </g>
     <g id="legend-label--hf-kernels-swiglu" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
     </g>
     <g id="line2d_17">
-     <path d="M 722.811356 456.514158  L 732.811356 456.514158  L 742.811356 456.514158  " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+     <path d="M 722.811356 54.934687  L 732.811356 54.934687  L 742.811356 54.934687  " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
+      <use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
      </g>
     </g>
     <g id="legend-label--torch-eager" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="460.014158" transform="rotate(-0 750.811356 460.014158)">torch_eager</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="58.434687" transform="rotate(-0 750.811356 58.434687)">torch_eager</text>
     </g>
    </g>
   </g>
@@ -4193,7 +4193,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: combine | 38.46s
+Cell: combine | 4.45s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4267,13 +4267,13 @@ Cell: combine | 38.46s
 <div class="cell-stdout"><pre class="stdout-text">======================================================================
 LOADING BENCHMARK DATA
 ======================================================================
-✓ HF Kernels SwiGLU             : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/activation/impls/.uvnote/cache/d30443d7e6209ed0a7ffb0b020b1f31815cb2e95563283b7a25710e6420dbed8
-✓ PyTorch SwiGLU                : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/activation/impls/.uvnote/cache/86a65ffc73cc3e7a7b1efe81bde7937d3f4e55d4f6b857c3fca0d9008687d8d6
+✓ HF Kernels SwiGLU             : /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/2775e6386f1caf1fda935a997130c06dcaf7641efb0db21560c35301fdabfd9b
+✓ PyTorch SwiGLU                : /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/661ca38adec8893d7c284140e922da661f0afcea4aaff6a3bf48a6494ce7c6eb
 
   ✓ Found HF Kernels SwiGLU
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/activation/impls/.uvnote/cache/d30443d7e6209ed0a7ffb0b020b1f31815cb2e95563283b7a25710e6420dbed8/activation.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/2775e6386f1caf1fda935a997130c06dcaf7641efb0db21560c35301fdabfd9b/activation.jsonl
   ✓ Found PyTorch SwiGLU
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/activation/impls/.uvnote/cache/86a65ffc73cc3e7a7b1efe81bde7937d3f4e55d4f6b857c3fca0d9008687d8d6/activation.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/661ca38adec8893d7c284140e922da661f0afcea4aaff6a3bf48a6494ce7c6eb/activation.jsonl
 
 ======================================================================
 Summary: 2 found, 0 skipped, 0 missing
@@ -4293,7 +4293,7 @@ hf_kernels_swiglu        cuda_T512_D2048        0.03  True
 hf_kernels_swiglu        cuda_T512_D768         0.03  True
 torch_eager              cuda_T128_D1024        0.05  True
 torch_eager              cuda_T128_D2048        0.05  True
-torch_eager              cuda_T128_D768         0.05  True
+torch_eager              cuda_T128_D768         0.04  True
 torch_eager              cuda_T256_D1024        0.05  True
 torch_eager              cuda_T256_D2048        0.05  True
 torch_eager              cuda_T256_D768         0.05  True
@@ -4319,53 +4319,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-   Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading numpy (15.9MiB)
-Downloading sympy (6.0MiB)
-Downloading networkx (1.9MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading torch (846.8MiB)
-Downloading setuptools (1.1MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading pillow (6.7MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading fonttools (4.7MiB)
-Downloading triton (148.4MiB)
-Downloading matplotlib (8.3MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading networkx
- Downloading fonttools
- Downloading pillow
-      Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading nvidia-cuda-cupti-cu12
- Downloading matplotlib
- Downloading numpy
- Downloading sympy
- Downloading nvidia-nvjitlink-cu12
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
-Installed 37 packages in 212ms
+Installed 37 packages in 250ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4378,7 +4332,7 @@ Installed 37 packages in 212ms
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-24T19:26:55.354611</dc:date>
+    <dc:date>2025-10-27T14:46:43.482898</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4527,83 +4481,83 @@ Installed 37 packages in 212ms
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 60.23 398.041123  L 847.294169 398.041123  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 452.615548  L 847.294169 452.615548  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_10">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="398.041123" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="452.615548" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_10">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="401.840342" transform="rotate(-0 53.23 401.840342)">0.030</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="456.414767" transform="rotate(-0 53.23 456.414767)">0.025</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 60.23 324.254737  L 847.294169 324.254737  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 373.068398  L 847.294169 373.068398  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_11">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="324.254737" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="373.068398" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_11">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="328.053956" transform="rotate(-0 53.23 328.053956)">0.035</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="376.867617" transform="rotate(-0 53.23 376.867617)">0.030</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 60.23 250.468352  L 847.294169 250.468352  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 293.521249  L 847.294169 293.521249  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_12">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="250.468352" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="293.521249" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_12">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="254.26757" transform="rotate(-0 53.23 254.26757)">0.040</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="297.320468" transform="rotate(-0 53.23 297.320468)">0.035</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 60.23 176.681966  L 847.294169 176.681966  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 213.974099  L 847.294169 213.974099  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_13">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="176.681966" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="213.974099" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_13">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="180.481185" transform="rotate(-0 53.23 180.481185)">0.045</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="217.773318" transform="rotate(-0 53.23 217.773318)">0.040</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 60.23 102.89558  L 847.294169 102.89558  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 134.42695  L 847.294169 134.42695  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_14">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="102.89558" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="134.42695" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_14">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="106.694799" transform="rotate(-0 53.23 106.694799)">0.050</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="138.226168" transform="rotate(-0 53.23 138.226168)">0.045</text>
      </g>
     </g>
     <g id="ytick_6">
      <g id="grid-y--7" class="grid grid-y">
-      <path d="M 60.23 29.109195  L 847.294169 29.109195  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 54.8798  L 847.294169 54.8798  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_15">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="29.109195" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="54.8798" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_15">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="32.908413" transform="rotate(-0 53.23 32.908413)">0.055</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="58.679019" transform="rotate(-0 53.23 58.679019)">0.050</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4611,37 +4565,37 @@ Installed 37 packages in 212ms
     </g>
    </g>
    <g id="series--hf-kernels-swiglu" class="series">
-    <path d="M 96.005644 451.16779  L 185.444754 387.991045  L 274.883864 378.251855  L 364.322974 388.728914  L 453.762084 400.830317  L 543.201194 401.582789  L 632.640304 395.827579  L 722.079415 413.683333  L 811.518525 378.989724  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 96.005644 451.16779  L 185.444754 379.591266  L 274.883864 367.802376  L 364.322974 382.120864  L 453.762084 356.82487  L 543.201194 396.121166  L 632.640304 374.96162  L 722.079415 389.136924  L 811.518525 358.734003  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p620c7d392f)">
      <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="185.444754" y="387.991045" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="274.883864" y="378.251855" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="364.322974" y="388.728914" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="453.762084" y="400.830317" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="543.201194" y="401.582789" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="632.640304" y="395.827579" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="722.079415" y="413.683333" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="811.518525" y="378.989724" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="185.444754" y="379.591266" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="274.883864" y="367.802376" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="364.322974" y="382.120864" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="453.762084" y="356.82487" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="543.201194" y="396.121166" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="632.640304" y="374.96162" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="722.079415" y="389.136924" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="811.518525" y="358.734003" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-eager" class="series">
-    <path d="M 96.005644 118.965896  L 185.444754 47.08418  L 274.883864 58.165966  L 364.322974 50.625782  L 453.762084 57.428956  L 543.201194 72.776469  L 632.640304 54.181987  L 722.079415 80.892174  L 811.518525 57.28121  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 96.005644 189.63267  L 185.444754 53.272948  L 274.883864 47.08418  L 364.322974 66.175497  L 453.762084 61.545851  L 543.201194 66.795966  L 632.640304 59.954911  L 722.079415 85.26681  L 811.518525 95.751126  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#p620c7d392f)">
-     <use ns4:href="#m9b8c54d372" x="96.005644" y="118.965896" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="274.883864" y="58.165966" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="364.322974" y="50.625782" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="453.762084" y="57.428956" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="543.201194" y="72.776469" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="632.640304" y="54.181987" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="722.079415" y="80.892174" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="811.518525" y="57.28121" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="96.005644" y="189.63267" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="185.444754" y="53.272948" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="274.883864" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="364.322974" y="66.175497" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="453.762084" y="61.545851" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="543.201194" y="66.795966" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="632.640304" y="59.954911" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="722.079415" y="85.26681" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="811.518525" y="95.751126" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="patch_3">
@@ -4661,25 +4615,25 @@ Installed 37 packages in 212ms
    </g>
    <g id="legend" class="legend">
     <g id="patch_7">
-     <path d="M 720.811356 466.37197  L 840.294169 466.37197  Q 842.294169 466.37197 842.294169 464.37197  L 842.294169 435.45947  Q 842.294169 433.45947 840.294169 433.45947  L 720.811356 433.45947  Q 718.811356 433.45947 718.811356 435.45947  L 718.811356 464.37197  Q 718.811356 466.37197 720.811356 466.37197  L 720.811356 466.37197  z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
+     <path d="M 720.811356 64.7925  L 840.294169 64.7925  Q 842.294169 64.7925 842.294169 62.7925  L 842.294169 33.88  Q 842.294169 31.88 840.294169 31.88  L 720.811356 31.88  Q 718.811356 31.88 718.811356 33.88  L 718.811356 62.7925  Q 718.811356 64.7925 720.811356 64.7925  L 720.811356 64.7925  z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
     </g>
     <g id="line2d_16">
-     <path d="M 722.811356 441.557908  L 732.811356 441.557908  L 742.811356 441.557908  " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+     <path d="M 722.811356 39.978438  L 732.811356 39.978438  L 742.811356 39.978438  " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
+      <use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
      </g>
     </g>
     <g id="legend-label--hf-kernels-swiglu" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
     </g>
     <g id="line2d_17">
-     <path d="M 722.811356 456.514158  L 732.811356 456.514158  L 742.811356 456.514158  " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+     <path d="M 722.811356 54.934687  L 732.811356 54.934687  L 742.811356 54.934687  " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
+      <use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
      </g>
     </g>
     <g id="legend-label--torch-eager" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="460.014158" transform="rotate(-0 750.811356 460.014158)">torch_eager</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="58.434687" transform="rotate(-0 750.811356 58.434687)">torch_eager</text>
     </g>
    </g>
   </g>
diff --git a/flash_attn/impls/artifacts/benchmark/attention.jsonl b/flash_attn/impls/artifacts/benchmark/attention.jsonl
index a71c816c6a4debc70577fca4dc7743032e2e6be5..f7b87bffff02cfb69a5abf9ea7fad8f878048292 100644
--- a/flash_attn/impls/artifacts/benchmark/attention.jsonl
+++ b/flash_attn/impls/artifacts/benchmark/attention.jsonl
@@ -1,6 +1,6 @@
-{"ts": "2025-10-24T19:25:35Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 2.817559987306595, "p50": 2.819840970914811, "p90": 2.8203310212120414, "mean": 2.8193464037030935, "iqr": 0.002661021426320076, "raw_times": [2.8176699997857213, 2.8203310212120414, 2.821330039296299, 2.819840970914811, 2.817559987306595], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.8170199948363006, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003528594970703125, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-24T19:25:35Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 3.9076139801181853, "p50": 3.9129150100052357, "p90": 3.91379400389269, "mean": 3.920128010213375, "iqr": 0.0021209707483649254, "raw_times": [3.9546440239064395, 3.9076139801181853, 3.9129150100052357, 3.911673033144325, 3.91379400389269], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.1108770053833723, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-24T19:25:35Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 4.073257034178823, "p50": 4.119218967389315, "p90": 4.122229001950473, "mean": 4.102474392857403, "iqr": 0.04891102435067296, "raw_times": [4.073257034178823, 4.122229001950473, 4.119218967389315, 4.124348983168602, 4.0733179775998], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.606237005442381, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-24T19:25:36Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 4.078477970324457, "p50": 4.127818974666297, "p90": 4.151278990320861, "mean": 4.122894583269954, "iqr": 0.06814103107899427, "raw_times": [4.173759021796286, 4.151278990320861, 4.127818974666297, 4.078477970324457, 4.083137959241867], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.617736976593733, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-24T19:25:36Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 4.104706982616335, "p50": 4.1118780500255525, "p90": 4.146788967773318, "mean": 4.123546194750816, "iqr": 0.0404709717258811, "raw_times": [4.106317996047437, 4.104706982616335, 4.1118780500255525, 4.146788967773318, 4.148038977291435], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.064576991368085, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003528594970703125, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-24T19:25:36Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 4.358973994385451, "p50": 4.570448014419526, "p90": 4.571158031467348, "mean": 4.518645000644028, "iqr": 0.052271061576902866, "raw_times": [4.358973994385451, 4.570448014419526, 4.57375799305737, 4.571158031467348, 4.5188869698904455], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.924274002201855, "peak_bytes": 319946752, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003528594970703125, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
+{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
+{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
+{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
+{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
+{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
diff --git a/flash_attn/impls/cells/benchmark.py b/flash_attn/impls/cells/benchmark.py
index 3322ea52931b529ccb87a69b788c5390d7ca6dd7..64fe6a4eeb97a838a63f7152c1133db1ed3229d9 100644
--- a/flash_attn/impls/cells/benchmark.py
+++ b/flash_attn/impls/cells/benchmark.py
@@ -3,8 +3,9 @@
 # dependencies = [
 #     "numpy",
 #     "torch==2.8.0",
-#     "kernels-benchmark-tools",
 #     "kernels",
+#     "kernels-benchmark-tools",
+#     "sageattention",
 # ]
 #
 # [tool.uv.sources]
@@ -15,18 +16,18 @@ import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 from kernels import get_kernel
 
-# Load the flash attention kernel
-hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn")
+# Load the sage attention kernel
+hf_kernels_sage_attn = get_kernel("kernels-community/sage_attention")
 
 
-def hf_flash_attention(query, key, value):
-    """HuggingFace Kernels Flash Attention"""
-    return hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)[0]
+def sage_attention(query, key, value):
+    """SageAttention with INT8 Q/K quantization and FP16 P/V"""
+    return hf_kernels_sage_attn.fwd(query, key, value, is_causal=False)[0]
 
 
 run_benchmark(
     kernel_type=KernelTypeEnum.ATTENTION,
-    impl_name="hf_kernels_flash_attn",
-    impl_tags={"family": "hf-kernels", "backend": "flash-attn", "compile": "none"},
-    impl_func=hf_flash_attention,
+    impl_name="sage_int8_fp16",
+    impl_tags={"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"},
+    impl_func=sage_attention,
 )
\ No newline at end of file
diff --git a/flash_attn/impls/flash_attention.html b/flash_attn/impls/flash_attention.html
index 1a8157b79111cd82be83b7682897d7f7f715a588..865b225e49d1ec1a10bb57c96dc824f8c850085f 100644
--- a/flash_attn/impls/flash_attention.html
+++ b/flash_attn/impls/flash_attention.html
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 4.05s
+Cell: nv | 0.26s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3888,34 +3888,22 @@ Cell: nv | 4.05s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Fri Oct 24 19:21:04 2025       
+<div class="cell-stdout"><pre class="stdout-text">Mon Oct 27 14:45:45 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
-|   0  NVIDIA L4                      Off |   00000000:38:00.0 Off |                    0 |
-| N/A   36C    P0             27W /   72W |       1MiB /  23034MiB |      0%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   1  NVIDIA L4                      Off |   00000000:3A:00.0 Off |                    0 |
-| N/A   33C    P0             28W /   72W |       1MiB /  23034MiB |      2%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   2  NVIDIA L4                      Off |   00000000:3C:00.0 Off |                    0 |
-| N/A   34C    P0             27W /   72W |       1MiB /  23034MiB |      1%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   3  NVIDIA L4                      Off |   00000000:3E:00.0 Off |                    0 |
-| N/A   33C    P0             27W /   72W |       1MiB /  23034MiB |      2%      Default |
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   31C    P0            135W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
 +-----------------------------------------------------------------------------------------+
 | Processes:                                                                              |
-|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
 |        ID   ID                                                               Usage      |
 |=========================================================================================|
 |  No running processes found                                                             |
@@ -3931,9 +3919,9 @@ Cell: nv | 4.05s
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 44.13s
+Cell: benchmark | 3.87s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3984,29 +3972,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         2.87%     353.236us        20.60%       2.536ms       2.536ms       0.000us         0.00%      10.773ms      10.773ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us      10.620ms       100.09%      10.620ms      10.620ms             1  
-                     aten::scaled_dot_product_attention         0.36%      44.342us         1.92%     236.065us      78.688us       0.000us         0.00%       8.386ms       2.795ms             3  
-              aten::_scaled_dot_product_flash_attention         0.24%      29.551us         1.56%     191.723us      63.908us       0.000us         0.00%       8.386ms       2.795ms             3  
-                         aten::_flash_attention_forward         0.31%      38.342us         1.10%     135.583us      45.194us       8.386ms        79.03%       8.386ms       2.795ms             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       8.386ms        79.03%       8.386ms       2.795ms             3  
-                                       aten::contiguous         0.12%      15.199us        15.18%       1.869ms     155.744us       0.000us         0.00%       2.387ms     198.924us            12  
-                                            aten::clone         0.36%      44.321us        15.06%       1.854ms     154.478us       0.000us         0.00%       2.387ms     198.924us            12  
-                                            aten::copy_         0.78%      95.990us        13.98%       1.720ms     143.361us       2.225ms        20.97%       2.387ms     198.924us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.225ms        20.97%       2.225ms     185.396us            12  
-                                Activity Buffer Request        12.35%       1.520ms        12.35%       1.520ms       1.520ms     162.335us         1.53%     162.335us     162.335us             1  
-                                        aten::transpose         0.62%      76.778us         0.84%     103.972us       4.332us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.22%      27.194us         0.22%      27.194us       1.133us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.24%      30.024us         0.91%     112.425us       7.495us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.80%      98.881us         0.80%      98.881us       4.120us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         1.06%     129.984us         1.06%     129.984us       8.666us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.14%      17.180us         0.14%      17.180us       5.727us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.02%       2.899us         0.02%       2.899us       0.483us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.10%      11.980us         0.10%      11.980us       3.993us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        79.40%       9.774ms        79.40%       9.774ms       9.774ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.610ms       101.76%       3.610ms       3.610ms             1  
+                                         torch_flash_ma         6.54%     340.396us        46.01%       2.394ms       2.394ms       0.000us         0.00%       3.588ms       3.588ms             1  
+                     aten::scaled_dot_product_attention         0.84%      43.810us         4.24%     220.593us      73.531us       0.000us         0.00%       2.829ms     943.091us             3  
+              aten::_scaled_dot_product_flash_attention         0.51%      26.609us         3.40%     176.783us      58.928us       0.000us         0.00%       2.829ms     943.091us             3  
+                         aten::_flash_attention_forward         0.74%      38.381us         2.45%     127.692us      42.564us       2.829ms        79.74%       2.829ms     943.091us             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.829ms        79.74%       2.829ms     943.091us             3  
+                                       aten::contiguous         0.29%      15.001us        33.86%       1.762ms     146.802us       0.000us         0.00%     759.072us      63.256us            12  
+                                            aten::clone         0.76%      39.432us        33.57%       1.747ms     145.552us       0.000us         0.00%     759.072us      63.256us            12  
+                                            aten::copy_         1.71%      88.801us        31.26%       1.626ms     135.534us     718.688us        20.26%     759.072us      63.256us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     718.688us        20.26%     718.688us      59.891us            12  
+                                Activity Buffer Request        27.68%       1.440ms        27.68%       1.440ms       1.440ms      40.384us         1.14%      40.384us      40.384us             1  
+                                        aten::transpose         1.34%      69.973us         1.80%      93.503us       3.896us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.45%      23.530us         0.45%      23.530us       0.980us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.50%      25.908us         1.97%     102.319us       6.821us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.75%      91.041us         1.75%      91.041us       3.793us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.36%     123.031us         2.36%     123.031us       8.202us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.31%      16.010us         0.31%      16.010us       5.337us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.05%       2.700us         0.05%       2.700us       0.450us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.17%       8.980us         0.17%       8.980us       2.993us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        53.99%       2.809ms        53.99%       2.809ms       2.809ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 12.309ms
-Self CUDA time total: 10.610ms
+Self CPU time total: 5.203ms
+Self CUDA time total: 3.548ms
 
 
 
@@ -4016,29 +4004,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         1.72%     263.576us        14.84%       2.279ms       2.279ms       0.000us         0.00%      13.971ms      13.971ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us      13.784ms       100.09%      13.784ms      13.784ms             1  
-                     aten::scaled_dot_product_attention         0.17%      25.751us         1.16%     178.074us      59.358us       0.000us         0.00%      11.389ms       3.796ms             3  
-              aten::_scaled_dot_product_flash_attention         0.12%      18.370us         0.99%     152.323us      50.774us       0.000us         0.00%      11.389ms       3.796ms             3  
-                         aten::_flash_attention_forward         0.21%      32.869us         0.72%     109.873us      36.624us      11.389ms        82.70%      11.389ms       3.796ms             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us      11.389ms        82.70%      11.389ms       3.796ms             3  
-                                       aten::contiguous         0.06%       9.710us        11.64%       1.787ms     148.932us       0.000us         0.00%       2.582ms     215.169us            12  
-                                            aten::clone         0.19%      29.062us        11.57%       1.777ms     148.123us       0.000us         0.00%       2.582ms     215.169us            12  
-                                            aten::copy_         0.55%      83.901us        10.97%       1.685ms     140.395us       2.382ms        17.30%       2.582ms     215.169us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.382ms        17.30%       2.382ms     198.534us            12  
-                                Activity Buffer Request         9.88%       1.517ms         9.88%       1.517ms       1.517ms     199.614us         1.45%     199.614us     199.614us             1  
-                                        aten::transpose         0.36%      54.739us         0.48%      74.091us       3.087us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.13%      19.352us         0.13%      19.352us       0.806us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.13%      19.810us         0.54%      82.371us       5.491us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.51%      77.821us         0.51%      77.821us       3.243us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         0.70%     107.293us         0.70%     107.293us       7.153us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.09%      13.681us         0.09%      13.681us       4.560us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.01%       1.961us         0.01%       1.961us       0.327us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.03%       4.001us         0.03%       4.001us       1.334us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        85.16%      13.081ms        85.16%      13.081ms      13.081ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         5.17%     272.917us        42.06%       2.218ms       2.218ms       0.000us         0.00%       3.821ms       3.821ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.777ms       100.28%       3.777ms       3.777ms             1  
+                     aten::scaled_dot_product_attention         0.53%      27.761us         3.55%     187.333us      62.444us       0.000us         0.00%       3.004ms       1.001ms             3  
+              aten::_scaled_dot_product_flash_attention         0.37%      19.492us         3.03%     159.572us      53.191us       0.000us         0.00%       3.004ms       1.001ms             3  
+                         aten::_flash_attention_forward         0.75%      39.549us         2.23%     117.371us      39.124us       3.004ms        79.75%       3.004ms       1.001ms             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.004ms        79.75%       3.004ms       1.001ms             3  
+                                       aten::contiguous         0.20%      10.320us        32.06%       1.691ms     140.876us       0.000us         0.00%     817.314us      68.110us            12  
+                                            aten::clone         0.55%      29.048us        31.86%       1.680ms     140.016us       0.000us         0.00%     817.314us      68.110us            12  
+                                            aten::copy_         1.64%      86.662us        30.11%       1.588ms     132.347us     762.658us        20.25%     817.314us      68.110us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     762.658us        20.25%     762.658us      63.555us            12  
+                                Activity Buffer Request        26.84%       1.415ms        26.84%       1.415ms       1.415ms      54.656us         1.45%      54.656us      54.656us             1  
+                                        aten::transpose         1.36%      71.528us         1.71%      90.179us       3.757us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.35%      18.651us         0.35%      18.651us       0.777us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.38%      19.801us         1.55%      81.840us       5.456us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.46%      77.040us         1.46%      77.040us       3.210us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.07%     108.973us         2.07%     108.973us       7.265us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.26%      13.940us         0.26%      13.940us       4.647us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.06%       2.910us         0.06%       2.910us       0.485us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.08%       4.240us         0.08%       4.240us       1.413us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        57.94%       3.056ms        57.94%       3.056ms       3.056ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 15.360ms
-Self CUDA time total: 13.772ms
+Self CPU time total: 5.274ms
+Self CUDA time total: 3.767ms
 
 
 
@@ -4048,29 +4036,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         1.59%     253.009us        16.33%       2.606ms       2.606ms       0.000us         0.00%      14.231ms      14.231ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us      14.040ms       100.09%      14.040ms      14.040ms             1  
-                     aten::scaled_dot_product_attention         0.16%      26.200us         1.12%     178.593us      59.531us       0.000us         0.00%      11.609ms       3.870ms             3  
-              aten::_scaled_dot_product_flash_attention         0.12%      19.071us         0.96%     152.393us      50.798us       0.000us         0.00%      11.609ms       3.870ms             3  
-                         aten::_flash_attention_forward         0.21%      33.032us         0.69%     110.322us      36.774us      11.609ms        82.76%      11.609ms       3.870ms             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us      11.609ms        82.76%      11.609ms       3.870ms             3  
-                                       aten::contiguous         0.06%      10.030us        13.32%       2.125ms     177.070us       0.000us         0.00%       2.623ms     218.547us            12  
-                                            aten::clone         0.18%      28.858us        13.25%       2.115ms     176.235us       0.000us         0.00%       2.623ms     218.547us            12  
-                                            aten::copy_         0.51%      81.604us        12.67%       2.022ms     168.500us       2.418ms        17.24%       2.623ms     218.547us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.418ms        17.24%       2.418ms     201.529us            12  
-                                Activity Buffer Request        11.62%       1.854ms        11.62%       1.854ms       1.854ms     204.222us         1.46%     204.222us     204.222us             1  
-                                        aten::transpose         0.33%      52.790us         0.45%      72.350us       3.015us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.12%      19.560us         0.12%      19.560us       0.815us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.12%      19.891us         0.52%      83.030us       5.535us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.49%      77.888us         0.49%      77.888us       3.245us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         0.69%     109.402us         0.69%     109.402us       7.293us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.09%      14.430us         0.09%      14.430us       4.810us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.01%       1.730us         0.01%       1.730us       0.288us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.02%       3.831us         0.02%       3.831us       1.277us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        83.67%      13.349ms        83.67%      13.349ms      13.349ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.99%     269.576us        41.89%       2.262ms       2.262ms       0.000us         0.00%       3.875ms       3.875ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.827ms       100.29%       3.827ms       3.827ms             1  
+                     aten::scaled_dot_product_attention         0.50%      27.011us         3.47%     187.262us      62.421us       0.000us         0.00%       3.037ms       1.012ms             3  
+              aten::_scaled_dot_product_flash_attention         0.35%      18.851us         2.97%     160.251us      53.417us       0.000us         0.00%       3.037ms       1.012ms             3  
+                         aten::_flash_attention_forward         0.72%      39.000us         2.20%     118.550us      39.517us       3.037ms        79.57%       3.037ms       1.012ms             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.037ms        79.57%       3.037ms       1.012ms             3  
+                                       aten::contiguous         0.18%       9.780us        32.51%       1.755ms     146.253us       0.000us         0.00%     838.461us      69.872us            12  
+                                            aten::clone         0.54%      29.119us        32.32%       1.745ms     145.438us       0.000us         0.00%     838.461us      69.872us            12  
+                                            aten::copy_         1.56%      84.200us        30.52%       1.648ms     137.328us     779.741us        20.43%     838.461us      69.872us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     779.741us        20.43%     779.741us      64.978us            12  
+                                Activity Buffer Request        27.41%       1.480ms        27.41%       1.480ms       1.480ms      58.720us         1.54%      58.720us      58.720us             1  
+                                        aten::transpose         1.00%      54.180us         1.34%      72.500us       3.021us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.34%      18.320us         0.34%      18.320us       0.763us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.36%      19.560us         1.66%      89.381us       5.959us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.53%      82.821us         1.53%      82.821us       3.451us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         1.99%     107.272us         1.99%     107.272us       7.151us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.30%      16.380us         0.30%      16.380us       5.460us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.850us         0.03%       1.850us       0.308us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.830us         0.07%       3.830us       1.277us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        58.11%       3.138ms        58.11%       3.138ms       3.138ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 15.955ms
-Self CUDA time total: 14.027ms
+Self CPU time total: 5.399ms
+Self CUDA time total: 3.817ms
 
 
 
@@ -4080,29 +4068,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         1.54%     253.696us        15.59%       2.567ms       2.567ms       0.000us         0.00%      14.787ms      14.787ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us      14.594ms       100.09%      14.594ms      14.594ms             1  
-                     aten::scaled_dot_product_attention         0.16%      26.450us         1.08%     178.164us      59.388us       0.000us         0.00%      12.117ms       4.039ms             3  
-              aten::_scaled_dot_product_flash_attention         0.12%      18.962us         0.92%     151.714us      50.571us       0.000us         0.00%      12.117ms       4.039ms             3  
-                         aten::_flash_attention_forward         0.20%      32.440us         0.66%     109.033us      36.344us      12.117ms        83.10%      12.117ms       4.039ms             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us      12.117ms        83.10%      12.117ms       4.039ms             3  
-                                       aten::contiguous         0.06%      10.538us        12.68%       2.087ms     173.951us       0.000us         0.00%       2.670ms     222.462us            12  
-                                            aten::clone         0.17%      28.412us        12.61%       2.077ms     173.073us       0.000us         0.00%       2.670ms     222.462us            12  
-                                            aten::copy_         0.50%      82.093us        12.05%       1.984ms     165.351us       2.464ms        16.90%       2.670ms     222.462us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.464ms        16.90%       2.464ms     205.326us            12  
-                                Activity Buffer Request         9.45%       1.555ms         9.45%       1.555ms       1.555ms     205.630us         1.41%     205.630us     205.630us             1  
-                                        aten::transpose         0.32%      52.269us         0.44%      71.730us       2.989us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.12%      19.461us         0.12%      19.461us       0.811us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.12%      19.690us         0.51%      84.151us       5.610us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%      77.802us         0.47%      77.802us       3.242us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         2.24%     369.337us         2.24%     369.337us      24.622us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.09%      14.871us         0.09%      14.871us       4.957us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.01%       1.880us         0.01%       1.880us       0.313us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.02%       4.010us         0.02%       4.010us       1.337us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        84.41%      13.899ms        84.41%      13.899ms      13.899ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.76%     268.853us        43.13%       2.435ms       2.435ms       0.000us         0.00%       3.964ms       3.964ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.917ms       100.30%       3.917ms       3.917ms             1  
+                     aten::scaled_dot_product_attention         0.49%      27.720us         3.46%     195.333us      65.111us       0.000us         0.00%       3.118ms       1.039ms             3  
+              aten::_scaled_dot_product_flash_attention         0.34%      19.471us         2.97%     167.613us      55.871us       0.000us         0.00%       3.118ms       1.039ms             3  
+                         aten::_flash_attention_forward         0.70%      39.530us         2.23%     125.742us      41.914us       3.118ms        79.84%       3.118ms       1.039ms             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.118ms        79.84%       3.118ms       1.039ms             3  
+                                       aten::contiguous         0.17%       9.719us        34.03%       1.921ms     160.116us       0.000us         0.00%     845.599us      70.467us            12  
+                                            aten::clone         0.52%      29.239us        33.85%       1.912ms     159.306us       0.000us         0.00%     845.599us      70.467us            12  
+                                            aten::copy_         1.54%      86.910us        32.19%       1.818ms     151.460us     787.167us        20.16%     845.599us      70.467us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     787.167us        20.16%     787.167us      65.597us            12  
+                                Activity Buffer Request        25.41%       1.435ms        25.41%       1.435ms       1.435ms      58.432us         1.50%      58.432us      58.432us             1  
+                                        aten::transpose         0.96%      54.080us         1.28%      72.141us       3.006us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.32%      18.061us         0.32%      18.061us       0.753us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.35%      19.512us         1.49%      84.134us       5.609us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.53%      86.581us         1.53%      86.581us       3.608us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         5.66%     319.547us         5.66%     319.547us      21.303us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.26%      14.430us         0.26%      14.430us       4.810us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.05%       2.740us         0.05%       2.740us       0.457us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       4.201us         0.07%       4.201us       1.400us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        56.87%       3.211ms        56.87%       3.211ms       3.211ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 16.466ms
-Self CUDA time total: 14.581ms
+Self CPU time total: 5.647ms
+Self CUDA time total: 3.906ms
 
 
 
@@ -4112,29 +4100,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         1.70%     278.864us        15.50%       2.543ms       2.543ms       0.000us         0.00%      14.797ms      14.797ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us      14.600ms       100.09%      14.600ms      14.600ms             1  
-                     aten::scaled_dot_product_attention         0.17%      27.381us         1.16%     189.724us      63.241us       0.000us         0.00%      12.088ms       4.029ms             3  
-              aten::_scaled_dot_product_flash_attention         0.12%      19.359us         0.99%     162.343us      54.114us       0.000us         0.00%      12.088ms       4.029ms             3  
-                         aten::_flash_attention_forward         0.21%      33.700us         0.72%     118.223us      39.408us      12.088ms        82.87%      12.088ms       4.029ms             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us      12.088ms        82.87%      12.088ms       4.029ms             3  
-                                       aten::contiguous         0.06%      10.278us        12.35%       2.025ms     168.720us       0.000us         0.00%       2.709ms     225.729us            12  
-                                            aten::clone         0.18%      29.935us        12.28%       2.014ms     167.864us       0.000us         0.00%       2.709ms     225.729us            12  
-                                            aten::copy_         0.52%      84.857us        11.68%       1.915ms     159.605us       2.499ms        17.13%       2.709ms     225.729us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.499ms        17.13%       2.499ms     208.262us            12  
-                                Activity Buffer Request         9.10%       1.493ms         9.10%       1.493ms       1.493ms     209.598us         1.44%     209.598us     209.598us             1  
-                                        aten::transpose         0.33%      54.376us         0.45%      74.216us       3.092us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.12%      19.840us         0.12%      19.840us       0.827us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.12%      20.251us         0.54%      88.821us       5.921us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.50%      82.172us         0.50%      82.172us       3.424us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         2.25%     368.209us         2.25%     368.209us      24.547us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.09%      14.850us         0.09%      14.850us       4.950us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.01%       2.110us         0.01%       2.110us       0.352us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.02%       3.861us         0.02%       3.861us       1.287us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        84.50%      13.857ms        84.50%      13.857ms      13.857ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         5.25%     320.614us        40.80%       2.490ms       2.490ms       0.000us         0.00%       4.428ms       4.428ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.377ms       100.25%       4.377ms       4.377ms             1  
+                     aten::scaled_dot_product_attention         0.44%      26.800us         3.27%     199.713us      66.571us       0.000us         0.00%       3.558ms       1.186ms             3  
+              aten::_scaled_dot_product_flash_attention         0.32%      19.239us         2.83%     172.913us      57.638us       0.000us         0.00%       3.558ms       1.186ms             3  
+                         aten::_flash_attention_forward         0.64%      38.816us         2.13%     129.963us      43.321us       3.558ms        81.48%       3.558ms       1.186ms             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.558ms        81.48%       3.558ms       1.186ms             3  
+                                       aten::contiguous         0.17%      10.568us        31.48%       1.922ms     160.138us       0.000us         0.00%     870.015us      72.501us            12  
+                                            aten::clone         0.48%      29.552us        31.31%       1.911ms     159.257us       0.000us         0.00%     870.015us      72.501us            12  
+                                            aten::copy_         1.37%      83.622us        29.71%       1.813ms     151.123us     808.479us        18.52%     870.015us      72.501us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     808.479us        18.52%     808.479us      67.373us            12  
+                                Activity Buffer Request        24.07%       1.469ms        24.07%       1.469ms       1.469ms      61.536us         1.41%      61.536us      61.536us             1  
+                                        aten::transpose         0.88%      53.494us         1.18%      71.893us       2.996us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.30%      18.399us         0.30%      18.399us       0.767us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.45%      27.388us         1.61%      98.450us       6.563us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.35%      82.243us         1.35%      82.243us       3.427us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         4.68%     285.943us         4.68%     285.943us      19.063us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.29%      17.820us         0.29%      17.820us       5.940us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.04%       2.328us         0.04%       2.328us       0.388us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       4.078us         0.07%       4.078us       1.359us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        59.20%       3.614ms        59.20%       3.614ms       3.614ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 16.399ms
-Self CUDA time total: 14.587ms
+Self CPU time total: 6.104ms
+Self CUDA time total: 4.366ms
 
 
 
@@ -4144,91 +4132,39 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         1.34%     250.556us        18.55%       3.457ms       3.457ms       0.000us         0.00%      16.094ms      16.094ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us      15.878ms       100.09%      15.878ms      15.878ms             1  
-                     aten::scaled_dot_product_attention         0.14%      25.201us         0.97%     180.244us      60.081us       0.000us         0.00%      12.955ms       4.318ms             3  
-              aten::_scaled_dot_product_flash_attention         0.10%      18.431us         0.83%     155.043us      51.681us       0.000us         0.00%      12.955ms       4.318ms             3  
-                         aten::_flash_attention_forward         0.18%      33.193us         0.61%     113.432us      37.811us      12.955ms        81.66%      12.955ms       4.318ms             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us      12.955ms        81.66%      12.955ms       4.318ms             3  
-                                       aten::contiguous         0.05%      10.100us        15.97%       2.976ms     248.003us       0.000us         0.00%       3.139ms     261.603us            12  
-                                            aten::clone         0.16%      29.450us        15.92%       2.966ms     247.161us       0.000us         0.00%       3.139ms     261.603us            12  
-                                            aten::copy_         0.46%      85.134us        15.41%       2.871ms     239.275us       2.909ms        18.34%       3.139ms     261.603us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.909ms        18.34%       2.909ms     242.440us            12  
-                                Activity Buffer Request         8.03%       1.497ms         8.03%       1.497ms       1.497ms     229.949us         1.45%     229.949us     229.949us             1  
-                                        aten::transpose         0.29%      53.550us         0.39%      73.110us       3.046us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.10%      19.560us         0.10%      19.560us       0.815us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.11%      19.791us         0.47%      87.501us       5.833us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.42%      78.571us         0.42%      78.571us       3.274us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         7.05%       1.313ms         7.05%       1.313ms      87.561us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.09%      17.450us         0.09%      17.450us       5.817us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.01%       1.828us         0.01%       1.828us       0.305us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.02%       3.779us         0.02%       3.779us       1.260us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        81.45%      15.178ms        81.45%      15.178ms      15.178ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.45%     272.752us        38.96%       2.390ms       2.390ms       0.000us         0.00%       4.517ms       4.517ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.467ms       100.24%       4.467ms       4.467ms             1  
+                     aten::scaled_dot_product_attention         0.45%      27.641us         3.22%     197.213us      65.738us       0.000us         0.00%       3.636ms       1.212ms             3  
+              aten::_scaled_dot_product_flash_attention         0.32%      19.841us         2.76%     169.572us      56.524us       0.000us         0.00%       3.636ms       1.212ms             3  
+                         aten::_flash_attention_forward         0.71%      43.282us         2.06%     126.092us      42.031us       3.636ms        81.58%       3.636ms       1.212ms             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.636ms        81.58%       3.636ms       1.212ms             3  
+                                       aten::contiguous         0.18%      11.069us        30.46%       1.869ms     155.711us       0.000us         0.00%     881.085us      73.424us            12  
+                                            aten::clone         0.50%      30.953us        30.28%       1.857ms     154.789us       0.000us         0.00%     881.085us      73.424us            12  
+                                            aten::copy_         1.39%      85.529us        28.66%       1.758ms     146.482us     820.670us        18.42%     881.085us      73.424us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     820.670us        18.42%     820.670us      68.389us            12  
+                                Activity Buffer Request        23.40%       1.435ms        23.40%       1.435ms       1.435ms      60.415us         1.36%      60.415us      60.415us             1  
+                                        aten::transpose         0.92%      56.138us         1.22%      75.130us       3.130us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.31%      18.992us         0.31%      18.992us       0.791us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.33%      20.287us         1.48%      90.810us       6.054us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.36%      83.613us         1.36%      83.613us       3.484us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         4.26%     261.175us         4.26%     261.175us      17.412us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.28%      17.260us         0.28%      17.260us       5.753us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.850us         0.03%       1.850us       0.308us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       4.250us         0.07%       4.250us       1.417us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        61.04%       3.744ms        61.04%       3.744ms       3.744ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 18.634ms
-Self CUDA time total: 15.864ms
+Self CPU time total: 6.134ms
+Self CUDA time total: 4.456ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_flash_ma           cuda_attn_L128_bfloat16     4.09  True
-torch_flash_ma           cuda_attn_L256_bfloat16     4.79  True
-torch_flash_ma           cuda_attn_L320_bfloat16     4.90  True
-torch_flash_ma           cuda_attn_L384_bfloat16     4.98  True
-torch_flash_ma           cuda_attn_L448_bfloat16     5.05  True
-torch_flash_ma           cuda_attn_L512_bfloat16     5.47  True
+torch_flash_ma           cuda_attn_L128_bfloat16     1.22  True
+torch_flash_ma           cuda_attn_L256_bfloat16     1.27  True
+torch_flash_ma           cuda_attn_L320_bfloat16     1.31  True
+torch_flash_ma           cuda_attn_L384_bfloat16     1.34  True
+torch_flash_ma           cuda_attn_L448_bfloat16     1.48  True
+torch_flash_ma           cuda_attn_L512_bfloat16     1.52  True
 </pre></div>
-<div class="uv-install-logs" id="uv-logs-benchmark">
-<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
-<div class="uv-logs-content" style="display: none;">
-   Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading numpy (15.9MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading pillow (6.7MiB)
-Downloading fonttools (4.7MiB)
-Downloading networkx (1.9MiB)
-Downloading setuptools (1.1MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading torch (846.8MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading matplotlib (8.3MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading sympy (6.0MiB)
-Downloading triton (148.4MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading networkx
- Downloading fonttools
- Downloading pillow
-      Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading matplotlib
- Downloading nvidia-cuda-cupti-cu12
- Downloading numpy
- Downloading sympy
- Downloading nvidia-nvjitlink-cu12
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
-Installed 37 packages in 231ms
-</div>
-</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
diff --git a/flash_attn/impls/hf_kernels_flash_attn.html b/flash_attn/impls/hf_kernels_flash_attn.html
index 89b7e537b19452298016ec6db0ede83224aeee2c..377e4f883c400300d7994f075a1a49399ece1b0c 100644
--- a/flash_attn/impls/hf_kernels_flash_attn.html
+++ b/flash_attn/impls/hf_kernels_flash_attn.html
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 47.93s
+Cell: benchmark | 35.44s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3926,21 +3926,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         1.75%     172.444us        18.87%       1.860ms       1.860ms       0.000us         0.00%      10.982ms      10.982ms             1  
-                               _flash_attn_9e27194::fwd         0.72%      71.472us        17.12%       1.688ms     562.609us       8.236ms       100.00%      10.982ms       3.661ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       8.238ms       100.02%       8.238ms       8.238ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       8.236ms       100.00%       8.236ms       2.745ms             3  
-                                Activity Buffer Request        14.98%       1.477ms        14.98%       1.477ms       1.477ms       2.746ms        33.34%       2.746ms       2.746ms             1  
-                                 cudaDeviceGetAttribute         0.11%      11.099us         0.11%      11.099us       0.740us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.19%      18.800us         0.53%      52.161us      17.387us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.34%      33.361us         0.34%      33.361us      11.120us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.27%      26.650us         0.27%      26.650us       2.961us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.09%       8.722us         0.09%       8.722us       2.907us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.41%      40.651us         0.41%      40.651us      13.550us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        81.13%       8.001ms        81.13%       8.001ms       8.001ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         3.89%     173.532us        41.54%       1.852ms       1.852ms       0.000us         0.00%       3.821ms       3.821ms             1  
+                               _flash_attn_9e27194::fwd         1.71%      76.382us        37.65%       1.679ms     559.513us       2.851ms       100.00%       3.821ms       1.274ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       2.852ms       100.05%       2.852ms       2.852ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       2.851ms       100.00%       2.851ms     950.289us             3  
+                                Activity Buffer Request        32.53%       1.450ms        32.53%       1.450ms       1.450ms     970.364us        34.04%     970.364us     970.364us             1  
+                                 cudaDeviceGetAttribute         0.10%       4.520us         0.10%       4.520us       0.301us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.46%      20.440us         1.29%      57.461us      19.154us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.83%      37.021us         0.83%      37.021us      12.340us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.76%      33.730us         0.76%      33.730us       3.748us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.29%      12.870us         0.29%      12.870us       4.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.97%      43.280us         0.97%      43.280us      14.427us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        58.46%       2.606ms        58.46%       2.606ms       2.606ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 9.861ms
-Self CUDA time total: 8.236ms
+Self CPU time total: 4.458ms
+Self CUDA time total: 2.851ms
 
 
 
@@ -3950,21 +3950,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         0.74%      96.063us        13.14%       1.699ms       1.699ms       0.000us         0.00%      15.210ms      15.210ms             1  
-                               _flash_attn_9e27194::fwd         0.37%      48.372us        12.39%       1.603ms     534.225us      11.384ms       100.00%      15.210ms       5.070ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us      11.386ms       100.02%      11.386ms      11.386ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us      11.384ms       100.00%      11.384ms       3.795ms             3  
-                                Activity Buffer Request        11.40%       1.474ms        11.40%       1.474ms       1.474ms       3.826ms        33.61%       3.826ms       3.826ms             1  
-                                 cudaDeviceGetAttribute         0.03%       4.448us         0.03%       4.448us       0.297us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.05%       6.910us         0.18%      23.882us       7.961us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.13%      16.972us         0.13%      16.972us       5.657us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.17%      21.490us         0.17%      21.490us       2.388us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.03%       3.650us         0.03%       3.650us       1.217us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.21%      26.920us         0.21%      26.920us       8.973us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        86.86%      11.232ms        86.86%      11.232ms      11.232ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         2.32%     104.162us        37.24%       1.676ms       1.676ms       0.000us         0.00%       4.000ms       4.000ms             1  
+                               _flash_attn_9e27194::fwd         1.05%      47.052us        34.93%       1.571ms     523.812us       2.988ms       100.00%       4.000ms       1.333ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       2.989ms       100.04%       2.989ms       2.989ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       2.988ms       100.00%       2.988ms     995.942us             3  
+                                Activity Buffer Request        32.02%       1.441ms        32.02%       1.441ms       1.441ms       1.012ms        33.87%       1.012ms       1.012ms             1  
+                                 cudaDeviceGetAttribute         0.10%       4.331us         0.10%       4.331us       0.289us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.16%       7.210us         0.52%      23.350us       7.783us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.36%      16.140us         0.36%      16.140us       5.380us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.47%      21.320us         0.47%      21.320us       2.369us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.10%       4.349us         0.10%       4.349us       1.450us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.67%      30.329us         0.67%      30.329us      10.110us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        62.76%       2.824ms        62.76%       2.824ms       2.824ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 12.931ms
-Self CUDA time total: 11.384ms
+Self CPU time total: 4.499ms
+Self CUDA time total: 2.988ms
 
 
 
@@ -3974,21 +3974,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         0.67%      91.024us        12.59%       1.703ms       1.703ms       0.000us         0.00%      15.954ms      15.954ms             1  
-                               _flash_attn_9e27194::fwd         0.35%      47.311us        11.92%       1.612ms     537.434us      11.964ms       100.00%      15.954ms       5.318ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us      11.966ms       100.01%      11.966ms      11.966ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us      11.964ms       100.00%      11.964ms       3.988ms             3  
-                                Activity Buffer Request        10.98%       1.485ms        10.98%       1.485ms       1.485ms       3.990ms        33.35%       3.990ms       3.990ms             1  
-                                 cudaDeviceGetAttribute         0.03%       4.340us         0.03%       4.340us       0.289us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.06%       8.720us         0.18%      24.830us       8.277us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.12%      16.110us         0.12%      16.110us       5.370us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.15%      20.500us         0.15%      20.500us       2.278us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.03%       3.660us         0.03%       3.660us       1.220us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.20%      26.400us         0.20%      26.400us       8.800us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        87.41%      11.823ms        87.41%      11.823ms      11.823ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         2.58%     116.241us        37.17%       1.677ms       1.677ms       0.000us         0.00%       4.040ms       4.040ms             1  
+                               _flash_attn_9e27194::fwd         1.11%      49.909us        34.60%       1.561ms     520.326us       3.012ms       100.00%       4.040ms       1.347ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.013ms       100.04%       3.013ms       3.013ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.012ms       100.00%       3.012ms       1.004ms             3  
+                                Activity Buffer Request        31.60%       1.426ms        31.60%       1.426ms       1.426ms       1.029ms        34.16%       1.029ms       1.029ms             1  
+                                 cudaDeviceGetAttribute         0.08%       3.801us         0.08%       3.801us       0.253us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.18%       8.151us         0.55%      24.960us       8.320us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.37%      16.809us         0.37%      16.809us       5.603us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.47%      21.201us         0.47%      21.201us       2.356us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.09%       3.950us         0.09%       3.950us       1.317us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.69%      31.260us         0.69%      31.260us      10.420us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        62.83%       2.835ms        62.83%       2.835ms       2.835ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 13.526ms
-Self CUDA time total: 11.964ms
+Self CPU time total: 4.512ms
+Self CUDA time total: 3.012ms
 
 
 
@@ -3998,21 +3998,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         0.67%      93.544us        14.10%       1.960ms       1.960ms       0.000us         0.00%      16.171ms      16.171ms             1  
-                               _flash_attn_9e27194::fwd         0.34%      47.108us        13.43%       1.866ms     622.149us      12.086ms       100.00%      16.171ms       5.390ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us      12.088ms       100.02%      12.088ms      12.088ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us      12.086ms       100.00%      12.086ms       4.029ms             3  
-                                Activity Buffer Request        10.87%       1.511ms        10.87%       1.511ms       1.511ms       4.085ms        33.80%       4.085ms       4.085ms             1  
-                                 cudaDeviceGetAttribute         0.03%       4.151us         0.03%       4.151us       0.277us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.05%       7.020us         0.18%      24.401us       8.134us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.13%      17.381us         0.13%      17.381us       5.794us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.16%      21.650us         0.16%      21.650us       2.406us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.03%       3.680us         0.03%       3.680us       1.227us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.83%     254.116us         1.83%     254.116us      84.705us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        85.90%      11.939ms        85.90%      11.939ms      11.939ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         2.01%      99.212us        38.53%       1.898ms       1.898ms       0.000us         0.00%       4.264ms       4.264ms             1  
+                               _flash_attn_9e27194::fwd         1.06%      52.152us        36.51%       1.799ms     599.723us       3.190ms       100.00%       4.264ms       1.421ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.191ms       100.05%       3.191ms       3.191ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.190ms       100.00%       3.190ms       1.063ms             3  
+                                Activity Buffer Request        28.82%       1.420ms        28.82%       1.420ms       1.420ms       1.074ms        33.68%       1.074ms       1.074ms             1  
+                                 cudaDeviceGetAttribute         0.09%       4.479us         0.09%       4.479us       0.299us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.16%       7.900us         0.54%      26.470us       8.823us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.38%      18.570us         0.38%      18.570us       6.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.46%      22.430us         0.46%      22.430us       2.492us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.08%       3.830us         0.08%       3.830us       1.277us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         5.47%     269.763us         5.47%     269.763us      89.921us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        61.47%       3.029ms        61.47%       3.029ms       3.029ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 13.899ms
-Self CUDA time total: 12.086ms
+Self CPU time total: 4.928ms
+Self CUDA time total: 3.190ms
 
 
 
@@ -4022,21 +4022,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         0.66%      93.812us        13.64%       1.945ms       1.945ms       0.000us         0.00%      16.623ms      16.623ms             1  
-                               _flash_attn_9e27194::fwd         0.35%      50.392us        12.98%       1.852ms     617.193us      12.470ms       100.00%      16.623ms       5.541ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us      12.472ms       100.02%      12.472ms      12.472ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us      12.470ms       100.00%      12.470ms       4.157ms             3  
-                                Activity Buffer Request        10.49%       1.496ms        10.49%       1.496ms       1.496ms       4.153ms        33.30%       4.153ms       4.153ms             1  
-                                 cudaDeviceGetAttribute         0.03%       4.180us         0.03%       4.180us       0.279us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.11%      15.512us         0.23%      32.181us      10.727us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.12%      16.669us         0.12%      16.669us       5.556us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.15%      21.480us         0.15%      21.480us       2.387us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.03%       4.150us         0.03%       4.150us       1.383us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.70%     242.835us         1.70%     242.835us      80.945us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        86.36%      12.315ms        86.36%      12.315ms      12.315ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         2.16%      88.971us        14.91%     614.057us     614.057us       0.000us         0.00%       4.875ms       4.875ms             1  
+                               _flash_attn_9e27194::fwd         1.23%      50.539us        12.75%     525.086us     175.029us       3.652ms       100.00%       4.875ms       1.625ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.653ms       100.04%       3.653ms       3.653ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.652ms       100.00%       3.652ms       1.217ms             3  
+                                Activity Buffer Request         5.08%     209.112us         5.08%     209.112us     209.112us       1.223ms        33.50%       1.223ms       1.223ms             1  
+                                 cudaDeviceGetAttribute         0.10%       3.960us         0.10%       3.960us       0.264us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.19%       7.749us         0.60%      24.700us       8.233us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.41%      16.951us         0.41%      16.951us       5.650us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.54%      22.121us         0.54%      22.121us       2.458us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.10%       4.190us         0.10%       4.190us       1.397us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         5.11%     210.464us         5.11%     210.464us      70.155us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        85.09%       3.504ms        85.09%       3.504ms       3.504ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 14.261ms
-Self CUDA time total: 12.470ms
+Self CPU time total: 4.118ms
+Self CUDA time total: 3.652ms
 
 
 
@@ -4046,89 +4046,88 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         0.61%      96.222us        15.74%       2.480ms       2.480ms       0.000us         0.00%      17.900ms      17.900ms             1  
-                               _flash_attn_9e27194::fwd         0.31%      49.571us        15.13%       2.384ms     794.661us      13.426ms       100.00%      17.900ms       5.967ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us      13.428ms       100.02%      13.428ms      13.428ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us      13.426ms       100.00%      13.426ms       4.475ms             3  
-                                Activity Buffer Request         9.64%       1.519ms         9.64%       1.519ms       1.519ms       4.474ms        33.33%       4.474ms       4.474ms             1  
-                                 cudaDeviceGetAttribute         0.03%       4.041us         0.03%       4.041us       0.269us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.05%       7.901us         0.16%      24.582us       8.194us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.11%      16.681us         0.11%      16.681us       5.560us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.13%      20.818us         0.13%      20.818us       2.313us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.02%       3.610us         0.02%       3.610us       1.203us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         4.84%     761.957us         4.84%     761.957us     253.986us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        84.26%      13.278ms        84.26%      13.278ms      13.278ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         2.23%      91.402us        14.65%     600.857us     600.857us       0.000us         0.00%       4.881ms       4.881ms             1  
+                               _flash_attn_9e27194::fwd         1.15%      47.191us        12.42%     509.455us     169.818us       3.654ms       100.00%       4.881ms       1.627ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.655ms       100.04%       3.655ms       3.655ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.654ms       100.00%       3.654ms       1.218ms             3  
+                                Activity Buffer Request         5.38%     220.623us         5.38%     220.623us     220.623us       1.227ms        33.59%       1.227ms       1.227ms             1  
+                                 cudaDeviceGetAttribute         0.09%       3.601us         0.09%       3.601us       0.240us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.18%       7.230us         0.58%      23.840us       7.947us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.40%      16.610us         0.40%      16.610us       5.537us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.51%      20.851us         0.51%      20.851us       2.317us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.09%       3.688us         0.09%       3.688us       1.229us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         4.62%     189.661us         4.62%     189.661us      63.220us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        85.35%       3.502ms        85.35%       3.502ms       3.502ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 15.758ms
-Self CUDA time total: 13.426ms
+Self CPU time total: 4.103ms
+Self CUDA time total: 3.654ms
 
 
 impl                     wl                  p50(ms)  ok
-hf_kernels_flash_attn    cuda_attn_L128_bfloat16     2.82  True
-hf_kernels_flash_attn    cuda_attn_L256_bfloat16     3.91  True
-hf_kernels_flash_attn    cuda_attn_L320_bfloat16     4.12  True
-hf_kernels_flash_attn    cuda_attn_L384_bfloat16     4.13  True
-hf_kernels_flash_attn    cuda_attn_L448_bfloat16     4.11  True
-hf_kernels_flash_attn    cuda_attn_L512_bfloat16     4.57  True
+hf_kernels_flash_attn    cuda_attn_L128_bfloat16     0.98  True
+hf_kernels_flash_attn    cuda_attn_L256_bfloat16     1.02  True
+hf_kernels_flash_attn    cuda_attn_L320_bfloat16     1.05  True
+hf_kernels_flash_attn    cuda_attn_L384_bfloat16     1.07  True
+hf_kernels_flash_attn    cuda_attn_L448_bfloat16     1.23  True
+hf_kernels_flash_attn    cuda_attn_L512_bfloat16     1.23  True
 </pre></div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-   Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading nvidia-cufft-cu12 (184.2MiB)
+   Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
 Downloading hf-xet (3.2MiB)
-Downloading setuptools (1.1MiB)
-Downloading sympy (6.0MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading fonttools (4.7MiB)
-Downloading kiwisolver (1.4MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 Downloading networkx (1.9MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading pillow (6.7MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
 Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading triton (148.4MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading pillow (6.7MiB)
 Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading sympy (6.0MiB)
+Downloading setuptools (1.1MiB)
+Downloading matplotlib (8.3MiB)
+Downloading numpy (16.2MiB)
+Downloading triton (148.3MiB)
 Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading torch (846.8MiB)
-Downloading numpy (15.9MiB)
+Downloading fonttools (4.7MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading matplotlib (8.3MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading torch (846.9MiB)
  Downloading nvidia-cufile-cu12
  Downloading kiwisolver
  Downloading hf-xet
  Downloading setuptools
- Downloading fonttools
  Downloading networkx
+ Downloading fonttools
  Downloading pillow
-      Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
+      Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
  Downloading nvidia-cuda-cupti-cu12
  Downloading matplotlib
  Downloading numpy
- Downloading nvidia-nvjitlink-cu12
  Downloading sympy
+ Downloading nvidia-nvjitlink-cu12
  Downloading nvidia-curand-cu12
  Downloading nvidia-cuda-nvrtc-cu12
  Downloading triton
  Downloading nvidia-cufft-cu12
  Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
  Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-cusparse-cu12
  Downloading nvidia-nccl-cu12
  Downloading nvidia-cublas-cu12
  Downloading nvidia-cudnn-cu12
  Downloading torch
-Installed 47 packages in 223ms
+Installed 52 packages in 223ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 20 files:   0%|          | 0/20 [00:00&lt;?, ?it/s]
-Fetching 20 files:   5%|▌         | 1/20 [00:00&lt;00:04,  4.15it/s]
-Fetching 20 files:  10%|█         | 2/20 [00:03&lt;00:35,  1.96s/it]
-Fetching 20 files: 100%|██████████| 20/20 [00:03&lt;00:00,  5.86it/s]</div>
+Fetching 20 files:  10%|█         | 2/20 [00:01&lt;00:12,  1.43it/s]
+Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 14.34it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
diff --git a/flash_attn/impls/hf_kernels_flash_attn3.html b/flash_attn/impls/hf_kernels_flash_attn3.html
index 9931f2961261033f4de2f06ea452f344486787ca..a053bb95457c873b96f776bcf4309302293dd2b6 100644
--- a/flash_attn/impls/hf_kernels_flash_attn3.html
+++ b/flash_attn/impls/hf_kernels_flash_attn3.html
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 45.91s
+Cell: benchmark | 5.62s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3925,19 +3925,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         1.92%     183.884us        20.78%       1.986ms       1.986ms       0.000us         0.00%      10.512ms      10.512ms             1  
-                                          FlashAttnFunc         1.41%     134.465us        18.86%       1.802ms     600.660us       0.000us         0.00%      10.512ms       3.504ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         0.80%      76.599us        17.45%       1.668ms     555.838us       7.883ms       100.00%      10.512ms       3.504ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       7.884ms       100.02%       7.884ms       7.884ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       7.883ms       100.00%       7.883ms       2.628ms             3  
-                                Activity Buffer Request        15.56%       1.487ms        15.56%       1.487ms       1.487ms       2.629ms        33.36%       2.629ms       2.629ms             1  
-                                            aten::empty         0.46%      44.151us         0.46%      44.151us       7.358us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.16%      15.420us         0.16%      15.420us       5.140us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.46%      44.162us         0.46%      44.162us      14.721us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        79.22%       7.570ms        79.22%       7.570ms       7.570ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         3.90%     171.143us        44.22%       1.941ms       1.941ms       0.000us         0.00%       3.653ms       3.653ms             1  
+                                          FlashAttnFunc         2.92%     128.011us        40.32%       1.769ms     589.788us       0.000us         0.00%       3.653ms       1.218ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.90%      83.422us        37.41%       1.641ms     547.118us       2.755ms       100.00%       3.653ms       1.218ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.756ms       100.05%       2.756ms       2.756ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.755ms       100.00%       2.755ms     918.306us             3  
+                                Activity Buffer Request        33.13%       1.454ms        33.13%       1.454ms       1.454ms     898.082us        32.60%     898.082us     898.082us             1  
+                                            aten::empty         1.02%      44.762us         1.02%      44.762us       7.460us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.33%      14.660us         0.33%      14.660us       4.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.02%      44.660us         1.02%      44.660us      14.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        55.78%       2.447ms        55.78%       2.447ms       2.447ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 9.555ms
-Self CUDA time total: 7.883ms
+Self CPU time total: 4.388ms
+Self CUDA time total: 2.755ms
 
 
 
@@ -3947,19 +3947,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          FlashAttnFunc         0.80%     101.601us        13.56%       1.712ms     570.799us       0.000us         0.00%      14.746ms       4.915ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         0.39%      49.531us        12.75%       1.611ms     536.932us      11.037ms       100.00%      14.746ms       4.915ms             3  
-                                 hf_kernels_flash_attn3         0.89%     111.943us        14.45%       1.824ms       1.824ms       0.000us         0.00%      14.746ms      14.746ms             1  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us      11.039ms       100.02%      11.039ms      11.039ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us      11.037ms       100.00%      11.037ms       3.679ms             3  
-                                Activity Buffer Request        11.87%       1.500ms        11.87%       1.500ms       1.500ms       3.709ms        33.60%       3.709ms       3.709ms             1  
-                                            aten::empty         0.21%      26.220us         0.21%      26.220us       4.370us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.04%       5.092us         0.04%       5.092us       1.697us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.24%      30.290us         0.24%      30.290us      10.097us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        85.55%      10.805ms        85.55%      10.805ms      10.805ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         2.42%     105.470us        40.03%       1.743ms       1.743ms       0.000us         0.00%       3.784ms       3.784ms             1  
+                                          FlashAttnFunc         2.12%      92.121us        37.61%       1.638ms     546.005us       0.000us         0.00%       3.784ms       1.261ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.23%      53.460us        35.49%       1.546ms     515.298us       2.836ms       100.00%       3.784ms       1.261ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.838ms       100.05%       2.838ms       2.838ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.836ms       100.00%       2.836ms     945.359us             3  
+                                Activity Buffer Request        32.85%       1.431ms        32.85%       1.431ms       1.431ms     947.652us        33.41%     947.652us     947.652us             1  
+                                            aten::empty         0.62%      27.052us         0.62%      27.052us       4.509us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.11%       4.721us         0.11%       4.721us       1.574us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.68%      29.730us         0.68%      29.730us       9.910us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        59.97%       2.612ms        59.97%       2.612ms       2.612ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 12.629ms
-Self CUDA time total: 11.037ms
+Self CPU time total: 4.355ms
+Self CUDA time total: 2.836ms
 
 
 
@@ -3969,19 +3969,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         0.84%     108.082us        14.36%       1.851ms       1.851ms       0.000us         0.00%      15.081ms      15.081ms             1  
-                                          FlashAttnFunc         0.79%     101.882us        13.52%       1.743ms     580.849us       0.000us         0.00%      15.081ms       5.027ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         0.38%      48.472us        12.73%       1.641ms     546.889us      11.268ms       100.00%      15.081ms       5.027ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us      11.269ms       100.02%      11.269ms      11.269ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us      11.268ms       100.00%      11.268ms       3.756ms             3  
-                                Activity Buffer Request        11.87%       1.530ms        11.87%       1.530ms       1.530ms       3.813ms        33.84%       3.813ms       3.813ms             1  
-                                            aten::empty         0.21%      26.670us         0.21%      26.670us       4.445us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.04%       5.170us         0.04%       5.170us       1.723us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.24%      30.581us         0.24%      30.581us      10.194us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        85.64%      11.041ms        85.64%      11.041ms      11.041ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         2.34%     104.112us        39.68%       1.767ms       1.767ms       0.000us         0.00%       3.931ms       3.931ms             1  
+                                          FlashAttnFunc         2.59%     115.143us        37.35%       1.662ms     554.155us       0.000us         0.00%       3.931ms       1.310ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.23%      54.772us        34.76%       1.547ms     515.774us       2.932ms       100.00%       3.931ms       1.310ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.934ms       100.05%       2.934ms       2.934ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.932ms       100.00%       2.932ms     977.432us             3  
+                                Activity Buffer Request        32.05%       1.427ms        32.05%       1.427ms       1.427ms     998.487us        34.05%     998.487us     998.487us             1  
+                                            aten::empty         0.66%      29.309us         0.66%      29.309us       4.885us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.11%       4.840us         0.11%       4.840us       1.613us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.71%      31.520us         0.71%      31.520us      10.507us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        60.32%       2.685ms        60.32%       2.685ms       2.685ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 12.891ms
-Self CUDA time total: 11.268ms
+Self CPU time total: 4.452ms
+Self CUDA time total: 2.932ms
 
 
 
@@ -3991,19 +3991,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         0.87%     107.542us        12.07%       1.493ms       1.493ms       0.000us         0.00%      14.923ms      14.923ms             1  
-                                          FlashAttnFunc         0.84%     104.222us        11.20%       1.385ms     461.687us       0.000us         0.00%      14.923ms       4.974ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         0.41%      51.032us        10.36%       1.281ms     426.946us      11.101ms       100.00%      14.923ms       4.974ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us      11.102ms       100.02%      11.102ms      11.102ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us      11.101ms       100.00%      11.101ms       3.700ms             3  
-                                Activity Buffer Request         7.69%     950.601us         7.69%     950.601us     950.601us       3.822ms        34.43%       3.822ms       3.822ms             1  
-                                            aten::empty         0.22%      27.719us         0.22%      27.719us       4.620us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.04%       5.160us         0.04%       5.160us       1.720us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.99%     246.326us         1.99%     246.326us      82.109us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        87.93%      10.869ms        87.93%      10.869ms      10.869ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         2.48%     118.391us        41.58%       1.983ms       1.983ms       0.000us         0.00%       4.029ms       4.029ms             1  
+                                          FlashAttnFunc         2.00%      95.232us        39.09%       1.865ms     621.579us       0.000us         0.00%       4.029ms       1.343ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.18%      56.301us        37.10%       1.770ms     589.835us       3.014ms       100.00%       4.029ms       1.343ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       3.016ms       100.06%       3.016ms       3.016ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.014ms       100.00%       3.014ms       1.005ms             3  
+                                Activity Buffer Request        30.19%       1.440ms        30.19%       1.440ms       1.440ms       1.015ms        33.67%       1.015ms       1.015ms             1  
+                                            aten::empty         0.58%      27.710us         0.58%      27.710us       4.618us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.10%       4.771us         0.10%       4.771us       1.590us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         5.05%     240.873us         5.05%     240.873us      80.291us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        58.42%       2.787ms        58.42%       2.787ms       2.787ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 12.361ms
-Self CUDA time total: 11.101ms
+Self CPU time total: 4.770ms
+Self CUDA time total: 3.014ms
 
 
 
@@ -4013,19 +4013,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         0.89%     122.681us        14.72%       2.032ms       2.032ms       0.000us         0.00%      16.019ms      16.019ms             1  
-                                          FlashAttnFunc         0.72%     100.054us        13.83%       1.909ms     636.464us       0.000us         0.00%      16.019ms       5.340ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         0.37%      50.743us        13.11%       1.809ms     603.113us      11.999ms       100.00%      16.019ms       5.340ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us      12.001ms       100.02%      12.001ms      12.001ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us      11.999ms       100.00%      11.999ms       4.000ms             3  
-                                Activity Buffer Request        10.68%       1.474ms        10.68%       1.474ms       1.474ms       4.020ms        33.50%       4.020ms       4.020ms             1  
-                                            aten::empty         0.20%      27.509us         0.20%      27.509us       4.585us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.04%       5.180us         0.04%       5.180us       1.727us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.82%     251.475us         1.82%     251.475us      83.825us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        85.28%      11.773ms        85.28%      11.773ms      11.773ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         2.45%     127.821us        37.14%       1.937ms       1.937ms       0.000us         0.00%       4.669ms       4.669ms             1  
+                                          FlashAttnFunc         1.78%      92.961us        34.69%       1.809ms     603.079us       0.000us         0.00%       4.669ms       1.556ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         0.98%      50.990us        32.91%       1.716ms     572.092us       3.496ms       100.00%       4.669ms       1.556ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       3.498ms       100.05%       3.498ms       3.498ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.496ms       100.00%       3.496ms       1.165ms             3  
+                                Activity Buffer Request        27.66%       1.443ms        27.66%       1.443ms       1.443ms       1.173ms        33.56%       1.173ms       1.173ms             1  
+                                            aten::empty         0.56%      28.951us         0.56%      28.951us       4.825us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.09%       4.870us         0.09%       4.870us       1.623us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.62%     188.673us         3.62%     188.673us      62.891us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        62.86%       3.279ms        62.86%       3.279ms       3.279ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 13.805ms
-Self CUDA time total: 11.999ms
+Self CPU time total: 5.216ms
+Self CUDA time total: 3.496ms
 
 
 
@@ -4035,87 +4035,34 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         0.65%     102.032us        20.79%       3.268ms       3.268ms       0.000us         0.00%      16.971ms      16.971ms             1  
-                                          FlashAttnFunc         0.66%     104.392us        20.14%       3.166ms       1.055ms       0.000us         0.00%      16.971ms       5.657ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         0.30%      47.113us        19.48%       3.062ms       1.021ms      12.681ms       100.00%      16.971ms       5.657ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us      12.683ms       100.02%      12.683ms      12.683ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us      12.681ms       100.00%      12.681ms       4.227ms             3  
-                                Activity Buffer Request        10.87%       1.709ms        10.87%       1.709ms       1.709ms       4.290ms        33.83%       4.290ms       4.290ms             1  
-                                            aten::empty         0.17%      27.090us         0.17%      27.090us       4.515us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.03%       5.219us         0.03%       5.219us       1.740us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.10%       1.273ms         8.10%       1.273ms     424.362us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        79.21%      12.453ms        79.21%      12.453ms      12.453ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         2.26%     115.651us        36.11%       1.844ms       1.844ms       0.000us         0.00%       4.648ms       4.648ms             1  
+                                          FlashAttnFunc         1.78%      91.130us        33.84%       1.728ms     576.085us       0.000us         0.00%       4.648ms       1.549ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.06%      54.250us        32.06%       1.637ms     545.708us       3.480ms       100.00%       4.648ms       1.549ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       3.481ms       100.04%       3.481ms       3.481ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.480ms       100.00%       3.480ms       1.160ms             3  
+                                Activity Buffer Request        27.00%       1.379ms        27.00%       1.379ms       1.379ms       1.168ms        33.58%       1.168ms       1.168ms             1  
+                                            aten::empty         0.55%      28.142us         0.55%      28.142us       4.690us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.10%       5.261us         0.10%       5.261us       1.754us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.35%     170.883us         3.35%     170.883us      56.961us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        63.89%       3.263ms        63.89%       3.263ms       3.263ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 15.722ms
-Self CUDA time total: 12.681ms
+Self CPU time total: 5.107ms
+Self CUDA time total: 3.480ms
 
 
 impl                     wl                  p50(ms)  ok
-hf_kernels_flash_attn3   cuda_attn_L128_bfloat16     3.22  True
-hf_kernels_flash_attn3   cuda_attn_L256_bfloat16     3.77  True
-hf_kernels_flash_attn3   cuda_attn_L320_bfloat16     3.91  True
-hf_kernels_flash_attn3   cuda_attn_L384_bfloat16     3.97  True
-hf_kernels_flash_attn3   cuda_attn_L448_bfloat16     4.19  True
-hf_kernels_flash_attn3   cuda_attn_L512_bfloat16     4.41  True
+hf_kernels_flash_attn3   cuda_attn_L128_bfloat16     0.95  True
+hf_kernels_flash_attn3   cuda_attn_L256_bfloat16     0.98  True
+hf_kernels_flash_attn3   cuda_attn_L320_bfloat16     1.03  True
+hf_kernels_flash_attn3   cuda_attn_L384_bfloat16     1.04  True
+hf_kernels_flash_attn3   cuda_attn_L448_bfloat16     1.21  True
+hf_kernels_flash_attn3   cuda_attn_L512_bfloat16     1.18  True
 </pre></div>
-<div class="uv-install-logs" id="uv-logs-benchmark">
-<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
-<div class="uv-logs-content" style="display: none;">
-   Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading sympy (6.0MiB)
-Downloading setuptools (1.1MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading numpy (15.9MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading matplotlib (8.3MiB)
-Downloading hf-xet (3.2MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading pillow (6.7MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading networkx (1.9MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading triton (148.4MiB)
-Downloading fonttools (4.7MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading torch (846.8MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading hf-xet
- Downloading setuptools
- Downloading networkx
- Downloading fonttools
- Downloading pillow
-      Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading nvidia-cuda-cupti-cu12
- Downloading matplotlib
- Downloading numpy
- Downloading sympy
- Downloading nvidia-nvjitlink-cu12
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
-Installed 47 packages in 222ms
+<div class="cell-stderr">
+Fetching 4 files:   0%|          | 0/4 [00:00&lt;?, ?it/s]
+Fetching 4 files:  50%|█████     | 2/4 [00:01&lt;00:01,  1.33it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00,  2.66it/s]
 </div>
-</div>
-<div class="cell-stderr">Fetching 4 files:   0%|          | 0/4 [00:00&lt;?, ?it/s]
-Fetching 4 files:  25%|██▌       | 1/4 [00:00&lt;00:00,  7.95it/s]
-Fetching 4 files:  50%|█████     | 2/4 [00:01&lt;00:01,  1.15it/s]
-Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00,  2.64it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
diff --git a/flash_attn/impls/mem_efficient_attention.html b/flash_attn/impls/mem_efficient_attention.html
index fe7541e7ea06ed98fb77a03fb38c560aafd75082..2ef177e15d8ffcd4554ffa06fae5689015fee95f 100644
--- a/flash_attn/impls/mem_efficient_attention.html
+++ b/flash_attn/impls/mem_efficient_attention.html
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 44.03s
+Cell: benchmark | 4.02s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3924,28 +3924,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         2.05%     363.238us        13.65%       2.421ms       2.421ms       0.000us         0.00%      16.223ms      16.223ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us      16.048ms       100.05%      16.048ms      16.048ms             1  
-                     aten::scaled_dot_product_attention         0.20%      35.830us         1.03%     182.144us      60.715us       0.000us         0.00%      14.265ms       4.755ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.13%      22.700us         0.82%     146.314us      48.771us       0.000us         0.00%      14.265ms       4.755ms             3  
-                     aten::_efficient_attention_forward         0.19%      33.351us         0.54%      96.203us      32.068us      14.265ms        88.94%      14.265ms       4.755ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us      14.265ms        88.94%      14.265ms       4.755ms             3  
-                                       aten::contiguous         0.08%      13.451us        10.18%       1.806ms     200.629us       0.000us         0.00%       1.957ms     217.467us             9  
-                                            aten::clone         0.17%      30.701us        10.10%       1.792ms     199.134us       0.000us         0.00%       1.957ms     217.467us             9  
-                                            aten::copy_         0.43%      76.213us         9.49%       1.684ms     187.121us       1.775ms        11.06%       1.957ms     217.467us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       1.775ms        11.06%       1.775ms     197.189us             9  
-                                Activity Buffer Request         8.62%       1.529ms         8.62%       1.529ms       1.529ms     182.494us         1.14%     182.494us     182.494us             1  
-                                        aten::transpose         0.41%      73.552us         0.55%      97.771us       4.074us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.14%      24.219us         0.14%      24.219us       1.009us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.13%      23.478us         0.44%      77.421us       8.602us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         0.48%      85.684us         0.48%      85.684us       4.080us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         0.58%     102.581us         0.58%     102.581us       8.548us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.02%       3.010us         0.02%       3.010us       1.003us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.02%       4.301us         0.02%       4.301us       1.434us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        86.35%      15.322ms        86.35%      15.322ms      15.322ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         4.61%     329.029us        32.49%       2.320ms       2.320ms       0.000us         0.00%       5.545ms       5.545ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       5.524ms       100.54%       5.524ms       5.524ms             1  
+                     aten::scaled_dot_product_attention         0.42%      29.860us         2.75%     196.242us      65.414us       0.000us         0.00%       4.878ms       1.626ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.35%      25.230us         2.33%     166.382us      55.461us       0.000us         0.00%       4.878ms       1.626ms             3  
+                     aten::_efficient_attention_forward         0.73%      52.049us         1.68%     119.861us      39.954us       4.878ms        88.79%       4.878ms       1.626ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       4.878ms        88.79%       4.878ms       1.626ms             3  
+                                       aten::contiguous         0.18%      13.143us        24.28%       1.734ms     192.643us       0.000us         0.00%     666.300us      74.033us             9  
+                                            aten::clone         0.50%      35.608us        24.09%       1.721ms     191.183us       0.000us         0.00%     666.300us      74.033us             9  
+                                            aten::copy_         1.01%      71.952us        22.59%       1.613ms     179.214us     615.708us        11.21%     666.300us      74.033us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     615.708us        11.21%     615.708us      68.412us             9  
+                                Activity Buffer Request        20.33%       1.452ms        20.33%       1.452ms       1.452ms      50.592us         0.92%      50.592us      50.592us             1  
+                                        aten::transpose         0.87%      61.994us         1.16%      82.494us       3.437us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.29%      20.500us         0.29%      20.500us       0.854us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.25%      17.742us         1.01%      72.112us       8.012us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         1.17%      83.610us         1.17%      83.610us       3.981us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         1.60%     114.582us         1.60%     114.582us       9.548us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.04%       3.180us         0.04%       3.180us       1.060us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.14%      10.280us         0.14%      10.280us       3.427us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        67.51%       4.821ms        67.51%       4.821ms       4.821ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 17.744ms
-Self CUDA time total: 16.040ms
+Self CPU time total: 7.141ms
+Self CUDA time total: 5.494ms
 
 
 
@@ -3955,28 +3955,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         1.10%     253.536us         9.32%       2.141ms       2.141ms       0.000us         0.00%      21.587ms      21.587ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us      21.402ms       100.04%      21.402ms      21.402ms             1  
-                     aten::scaled_dot_product_attention         0.08%      19.430us         0.63%     143.683us      47.894us       0.000us         0.00%      19.557ms       6.519ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.08%      18.332us         0.54%     124.253us      41.418us       0.000us         0.00%      19.557ms       6.519ms             3  
-                     aten::_efficient_attention_forward         0.12%      28.280us         0.35%      81.271us      27.090us      19.557ms        91.42%      19.557ms       6.519ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us      19.557ms        91.42%      19.557ms       6.519ms             3  
-                                       aten::contiguous         0.03%       7.109us         7.41%       1.701ms     189.023us       0.000us         0.00%       2.030ms     225.605us             9  
-                                            aten::clone         0.09%      20.673us         7.38%       1.694ms     188.233us       0.000us         0.00%       2.030ms     225.605us             9  
-                                            aten::copy_         0.27%      61.032us         7.08%       1.625ms     180.543us       1.836ms         8.58%       2.030ms     225.605us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       1.836ms         8.58%       1.836ms     203.973us             9  
-                                Activity Buffer Request         6.54%       1.501ms         6.54%       1.501ms       1.501ms     194.686us         0.91%     194.686us     194.686us             1  
-                                        aten::transpose         0.22%      49.892us         0.29%      67.250us       2.802us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.08%      17.358us         0.08%      17.358us       0.723us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.05%      11.620us         0.21%      48.540us       5.393us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         0.27%      63.131us         0.27%      63.131us       3.006us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         0.37%      84.411us         0.37%      84.411us       7.034us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.01%       2.460us         0.01%       2.460us       0.820us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.01%       2.960us         0.01%       2.960us       0.987us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        90.68%      20.821ms        90.68%      20.821ms      20.821ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         3.39%     253.102us        28.13%       2.097ms       2.097ms       0.000us         0.00%       5.972ms       5.972ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       5.926ms       100.15%       5.926ms       5.926ms             1  
+                     aten::scaled_dot_product_attention         0.26%      19.190us         1.92%     143.113us      47.704us       0.000us         0.00%       5.278ms       1.759ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.26%      19.540us         1.66%     123.923us      41.308us       0.000us         0.00%       5.278ms       1.759ms             3  
+                     aten::_efficient_attention_forward         0.37%      27.385us         1.10%      81.652us      27.217us       5.278ms        89.20%       5.278ms       1.759ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.278ms        89.20%       5.278ms       1.759ms             3  
+                                       aten::contiguous         0.09%       6.999us        22.26%       1.660ms     184.423us       0.000us         0.00%     693.503us      77.056us             9  
+                                            aten::clone         0.31%      23.031us        22.17%       1.653ms     183.645us       0.000us         0.00%     693.503us      77.056us             9  
+                                            aten::copy_         0.83%      61.989us        21.18%       1.579ms     175.477us     638.911us        10.80%     693.503us      77.056us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     638.911us        10.80%     638.911us      70.990us             9  
+                                Activity Buffer Request        19.45%       1.450ms        19.45%       1.450ms       1.450ms      54.592us         0.92%      54.592us      54.592us             1  
+                                        aten::transpose         0.64%      47.641us         0.86%      64.101us       2.671us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.22%      16.460us         0.22%      16.460us       0.686us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.16%      11.730us         0.68%      50.483us       5.609us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.86%      64.470us         0.86%      64.470us       3.070us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         1.21%      90.240us         1.21%      90.240us       7.520us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.290us         0.03%       2.290us       0.763us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.04%       3.130us         0.04%       3.130us       1.043us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        71.87%       5.359ms        71.87%       5.359ms       5.359ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 22.962ms
-Self CUDA time total: 21.392ms
+Self CPU time total: 7.456ms
+Self CUDA time total: 5.917ms
 
 
 
@@ -3986,28 +3986,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         1.02%     243.020us         8.92%       2.127ms       2.127ms       0.000us         0.00%      22.482ms      22.482ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us      22.293ms       100.04%      22.293ms      22.293ms             1  
-                     aten::scaled_dot_product_attention         0.08%      18.442us         0.60%     142.065us      47.355us       0.000us         0.00%      20.413ms       6.804ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.08%      17.984us         0.52%     123.623us      41.208us       0.000us         0.00%      20.413ms       6.804ms             3  
-                     aten::_efficient_attention_forward         0.12%      28.538us         0.35%      82.550us      27.517us      20.413ms        91.61%      20.413ms       6.804ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us      20.413ms        91.61%      20.413ms       6.804ms             3  
-                                       aten::contiguous         0.03%       7.301us         7.12%       1.699ms     188.733us       0.000us         0.00%       2.068ms     229.822us             9  
-                                            aten::clone         0.09%      20.431us         7.09%       1.691ms     187.922us       0.000us         0.00%       2.068ms     229.822us             9  
-                                            aten::copy_         0.25%      59.709us         6.80%       1.622ms     180.233us       1.870ms         8.39%       2.068ms     229.822us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       1.870ms         8.39%       1.870ms     207.771us             9  
-                                Activity Buffer Request         6.28%       1.498ms         6.28%       1.498ms       1.498ms     198.462us         0.89%     198.462us     198.462us             1  
-                                        aten::transpose         0.21%      49.091us         0.28%      66.291us       2.762us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.07%      17.200us         0.07%      17.200us       0.717us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.05%      11.563us         0.20%      48.772us       5.419us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         0.27%      63.659us         0.27%      63.659us       3.031us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         0.36%      86.324us         0.36%      86.324us       7.194us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.01%       2.431us         0.01%       2.431us       0.810us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.01%       2.970us         0.01%       2.970us       0.990us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        91.08%      21.725ms        91.08%      21.725ms      21.725ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         3.16%     240.823us        26.89%       2.051ms       2.051ms       0.000us         0.00%       6.167ms       6.167ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.117ms       100.14%       6.117ms       6.117ms             1  
+                     aten::scaled_dot_product_attention         0.24%      18.220us         1.81%     137.732us      45.911us       0.000us         0.00%       5.453ms       1.818ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.24%      18.402us         1.57%     119.512us      39.837us       0.000us         0.00%       5.453ms       1.818ms             3  
+                     aten::_efficient_attention_forward         0.35%      26.389us         1.04%      79.670us      26.557us       5.453ms        89.28%       5.453ms       1.818ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.453ms        89.28%       5.453ms       1.818ms             3  
+                                       aten::contiguous         0.09%       6.950us        21.38%       1.630ms     181.132us       0.000us         0.00%     713.534us      79.282us             9  
+                                            aten::clone         0.28%      21.189us        21.28%       1.623ms     180.360us       0.000us         0.00%     713.534us      79.282us             9  
+                                            aten::copy_         0.81%      62.032us        20.34%       1.551ms     172.330us     655.038us        10.72%     713.534us      79.282us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     655.038us        10.72%     655.038us      72.782us             9  
+                                Activity Buffer Request        18.63%       1.421ms        18.63%       1.421ms       1.421ms      58.496us         0.96%      58.496us      58.496us             1  
+                                        aten::transpose         0.62%      47.348us         0.84%      63.699us       2.654us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.21%      16.351us         0.21%      16.351us       0.681us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.15%      11.091us         0.67%      51.081us       5.676us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.86%      65.760us         0.86%      65.760us       3.131us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         1.18%      89.982us         1.18%      89.982us       7.498us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.210us         0.03%       2.210us       0.737us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.04%       3.100us         0.04%       3.100us       1.033us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        73.11%       5.575ms        73.11%       5.575ms       5.575ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 23.852ms
-Self CUDA time total: 22.283ms
+Self CPU time total: 7.626ms
+Self CUDA time total: 6.108ms
 
 
 
@@ -4017,28 +4017,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         1.02%     244.258us         9.92%       2.384ms       2.384ms       0.000us         0.00%      22.468ms      22.468ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us      22.273ms       100.04%      22.273ms      22.273ms             1  
-                     aten::scaled_dot_product_attention         0.08%      18.581us         0.64%     152.823us      50.941us       0.000us         0.00%      20.365ms       6.788ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.08%      18.340us         0.56%     134.242us      44.747us       0.000us         0.00%      20.365ms       6.788ms             3  
-                     aten::_efficient_attention_forward         0.12%      27.659us         0.39%      92.632us      30.877us      20.365ms        91.47%      20.365ms       6.788ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us      20.365ms        91.47%      20.365ms       6.788ms             3  
-                                       aten::contiguous         0.03%       7.371us         8.08%       1.943ms     215.938us       0.000us         0.00%       2.103ms     233.655us             9  
-                                            aten::clone         0.09%      21.799us         8.05%       1.936ms     215.119us       0.000us         0.00%       2.103ms     233.655us             9  
-                                            aten::copy_         0.27%      65.442us         7.66%       1.841ms     204.604us       1.898ms         8.53%       2.103ms     233.655us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       1.898ms         8.53%       1.898ms     210.921us             9  
-                                Activity Buffer Request         6.22%       1.495ms         6.22%       1.495ms       1.495ms     204.607us         0.92%     204.607us     204.607us             1  
-                                        aten::transpose         0.20%      48.657us         0.28%      66.799us       2.783us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.08%      18.142us         0.08%      18.142us       0.756us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.13%      32.371us         0.30%      72.832us       8.092us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         0.29%      69.063us         0.29%      69.063us       3.289us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         1.30%     311.775us         1.30%     311.775us      25.981us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.01%       2.430us         0.01%       2.430us       0.810us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.01%       2.951us         0.01%       2.951us       0.984us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        90.08%      21.659ms        90.08%      21.659ms      21.659ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         4.44%     356.182us        33.00%       2.648ms       2.648ms       0.000us         0.00%       6.210ms       6.210ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.165ms       100.21%       6.165ms       6.165ms             1  
+                     aten::scaled_dot_product_attention         0.29%      23.400us         2.31%     185.263us      61.754us       0.000us         0.00%       5.497ms       1.832ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.29%      23.202us         2.02%     161.863us      53.954us       0.000us         0.00%       5.497ms       1.832ms             3  
+                     aten::_efficient_attention_forward         0.44%      35.239us         1.36%     108.811us      36.270us       5.497ms        89.36%       5.497ms       1.832ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.497ms        89.36%       5.497ms       1.832ms             3  
+                                       aten::contiguous         0.11%       9.040us        25.54%       2.050ms     227.726us       0.000us         0.00%     712.735us      79.193us             9  
+                                            aten::clone         0.35%      28.461us        25.43%       2.040ms     226.722us       0.000us         0.00%     712.735us      79.193us             9  
+                                            aten::copy_         1.02%      82.020us        24.22%       1.944ms     215.993us     654.527us        10.64%     712.735us      79.193us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     654.527us        10.64%     654.527us      72.725us             9  
+                                Activity Buffer Request        19.35%       1.553ms        19.35%       1.553ms       1.553ms      58.208us         0.95%      58.208us      58.208us             1  
+                                        aten::transpose         0.81%      64.960us         1.09%      87.330us       3.639us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.28%      22.370us         0.28%      22.370us       0.932us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.19%      15.081us         0.85%      68.092us       7.566us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         1.09%      87.522us         1.09%      87.522us       4.168us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         4.25%     341.154us         4.25%     341.154us      28.429us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.04%       2.841us         0.04%       2.841us       0.947us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.05%       4.120us         0.05%       4.120us       1.373us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        67.00%       5.376ms        67.00%       5.376ms       5.376ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 24.043ms
-Self CUDA time total: 22.264ms
+Self CPU time total: 8.025ms
+Self CUDA time total: 6.152ms
 
 
 
@@ -4048,28 +4048,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         0.99%     238.965us         8.38%       2.024ms       2.024ms       0.000us         0.00%      22.887ms      22.887ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us      22.691ms       100.04%      22.691ms      22.691ms             1  
-                     aten::scaled_dot_product_attention         0.08%      19.540us         0.60%     145.283us      48.428us       0.000us         0.00%      20.756ms       6.919ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.08%      18.450us         0.52%     125.743us      41.914us       0.000us         0.00%      20.756ms       6.919ms             3  
-                     aten::_efficient_attention_forward         0.12%      28.200us         0.34%      82.042us      27.347us      20.756ms        91.51%      20.756ms       6.919ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us      20.756ms        91.51%      20.756ms       6.919ms             3  
-                                       aten::contiguous         0.03%       7.310us         6.62%       1.597ms     177.483us       0.000us         0.00%       2.130ms     236.720us             9  
-                                            aten::clone         0.08%      20.502us         6.59%       1.590ms     176.671us       0.000us         0.00%       2.130ms     236.720us             9  
-                                            aten::copy_         0.25%      60.710us         6.29%       1.519ms     168.815us       1.926ms         8.49%       2.130ms     236.720us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       1.926ms         8.49%       1.926ms     213.965us             9  
-                                Activity Buffer Request         4.97%       1.199ms         4.97%       1.199ms       1.199ms     204.798us         0.90%     204.798us     204.798us             1  
-                                        aten::transpose         0.21%      49.950us         0.28%      67.671us       2.820us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.07%      17.721us         0.07%      17.721us       0.738us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.05%      11.321us         0.21%      50.202us       5.578us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         0.27%      64.383us         0.27%      64.383us       3.066us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         1.17%     282.217us         1.17%     282.217us      23.518us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.01%       2.720us         0.01%       2.720us       0.907us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.01%       3.029us         0.01%       3.029us       1.010us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        91.62%      22.117ms        91.62%      22.117ms      22.117ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         3.33%     272.217us        28.45%       2.323ms       2.323ms       0.000us         0.00%       6.452ms       6.452ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.401ms       100.14%       6.401ms       6.401ms             1  
+                     aten::scaled_dot_product_attention         0.25%      20.040us         1.74%     141.700us      47.233us       0.000us         0.00%       5.729ms       1.910ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.23%      18.560us         1.49%     121.660us      40.553us       0.000us         0.00%       5.729ms       1.910ms             3  
+                     aten::_efficient_attention_forward         0.34%      27.420us         1.00%      81.440us      27.147us       5.729ms        89.62%       5.729ms       1.910ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.729ms        89.62%       5.729ms       1.910ms             3  
+                                       aten::contiguous         0.09%       7.310us        22.83%       1.865ms     207.177us       0.000us         0.00%     723.614us      80.402us             9  
+                                            aten::clone         0.27%      22.438us        22.75%       1.857ms     206.364us       0.000us         0.00%     723.614us      80.402us             9  
+                                            aten::copy_         0.75%      61.292us        21.84%       1.783ms     198.108us     663.806us        10.38%     723.614us      80.402us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     663.806us        10.38%     663.806us      73.756us             9  
+                                Activity Buffer Request        18.13%       1.481ms        18.13%       1.481ms       1.481ms      59.808us         0.94%      59.808us      59.808us             1  
+                                        aten::transpose         0.61%      49.591us         0.81%      66.019us       2.751us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.20%      16.428us         0.20%      16.428us       0.684us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.14%      11.501us         0.64%      51.871us       5.763us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.80%      65.620us         0.80%      65.620us       3.125us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         3.24%     264.473us         3.24%     264.473us      22.039us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.310us         0.03%       2.310us       0.770us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.04%       3.060us         0.04%       3.060us       1.020us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        71.55%       5.843ms        71.55%       5.843ms       5.843ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 24.141ms
-Self CUDA time total: 22.682ms
+Self CPU time total: 8.166ms
+Self CUDA time total: 6.392ms
 
 
 
@@ -4079,90 +4079,38 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         0.89%     241.438us         9.64%       2.630ms       2.630ms       0.000us         0.00%      25.454ms      25.454ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us      25.223ms       100.04%      25.223ms      25.223ms             1  
-                     aten::scaled_dot_product_attention         0.07%      18.690us         0.53%     143.613us      47.871us       0.000us         0.00%      22.917ms       7.639ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.07%      19.432us         0.46%     124.923us      41.641us       0.000us         0.00%      22.917ms       7.639ms             3  
-                     aten::_efficient_attention_forward         0.10%      27.951us         0.30%      81.832us      27.277us      22.917ms        90.90%      22.917ms       7.639ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us      22.917ms        90.90%      22.917ms       7.639ms             3  
-                                       aten::contiguous         0.03%       7.769us         8.07%       2.200ms     244.390us       0.000us         0.00%       2.537ms     281.850us             9  
-                                            aten::clone         0.08%      21.360us         8.04%       2.192ms     243.526us       0.000us         0.00%       2.537ms     281.850us             9  
-                                            aten::copy_         0.23%      62.351us         7.77%       2.118ms     235.368us       2.295ms         9.10%       2.537ms     281.850us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.295ms         9.10%       2.295ms     255.042us             9  
-                                Activity Buffer Request         5.96%       1.625ms         5.96%       1.625ms       1.625ms     241.278us         0.96%     241.278us     241.278us             1  
-                                        aten::transpose         0.19%      51.326us         0.25%      68.688us       2.862us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.06%      17.362us         0.06%      17.362us       0.723us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.04%      11.861us         0.19%      52.062us       5.785us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         0.24%      65.461us         0.24%      65.461us       3.117us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         1.67%     454.311us         1.67%     454.311us      37.859us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.01%       2.710us         0.01%       2.710us       0.903us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.01%       2.880us         0.01%       2.880us       0.960us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        90.36%      24.642ms        90.36%      24.642ms      24.642ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         2.84%     238.921us        26.25%       2.206ms       2.206ms       0.000us         0.00%       6.803ms       6.803ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.751ms       100.13%       6.751ms       6.751ms             1  
+                     aten::scaled_dot_product_attention         0.23%      19.080us         1.67%     140.122us      46.707us       0.000us         0.00%       6.072ms       2.024ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.22%      18.680us         1.44%     121.042us      40.347us       0.000us         0.00%       6.072ms       2.024ms             3  
+                     aten::_efficient_attention_forward         0.32%      27.009us         0.95%      79.840us      26.613us       6.072ms        90.07%       6.072ms       2.024ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       6.072ms        90.07%       6.072ms       2.024ms             3  
+                                       aten::contiguous         0.09%       7.439us        21.24%       1.785ms     198.324us       0.000us         0.00%     731.099us      81.233us             9  
+                                            aten::clone         0.26%      21.852us        21.15%       1.777ms     197.498us       0.000us         0.00%     731.099us      81.233us             9  
+                                            aten::copy_         0.77%      64.769us        20.27%       1.703ms     189.239us     669.820us         9.93%     731.099us      81.233us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     669.820us         9.93%     669.820us      74.424us             9  
+                                Activity Buffer Request        16.92%       1.422ms        16.92%       1.422ms       1.422ms      61.279us         0.91%      61.279us      61.279us             1  
+                                        aten::transpose         0.57%      48.271us         0.77%      64.334us       2.681us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.19%      16.063us         0.19%      16.063us       0.669us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.14%      11.440us         0.62%      52.480us       5.831us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.79%      66.661us         0.79%      66.661us       3.174us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         2.84%     238.383us         2.84%     238.383us      19.865us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.270us         0.03%       2.270us       0.757us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.04%       3.090us         0.04%       3.090us       1.030us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        73.75%       6.196ms        73.75%       6.196ms       6.196ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 27.271ms
-Self CUDA time total: 25.213ms
+Self CPU time total: 8.402ms
+Self CUDA time total: 6.742ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_mem_eff            cuda_attn_L128_bfloat16     6.77  True
-torch_mem_eff            cuda_attn_L256_bfloat16     7.24  True
-torch_mem_eff            cuda_attn_L320_bfloat16     7.52  True
-torch_mem_eff            cuda_attn_L384_bfloat16     7.59  True
-torch_mem_eff            cuda_attn_L448_bfloat16     7.97  True
-torch_mem_eff            cuda_attn_L512_bfloat16     8.47  True
+torch_mem_eff            cuda_attn_L128_bfloat16     1.89  True
+torch_mem_eff            cuda_attn_L256_bfloat16     1.95  True
+torch_mem_eff            cuda_attn_L320_bfloat16     2.05  True
+torch_mem_eff            cuda_attn_L384_bfloat16     2.08  True
+torch_mem_eff            cuda_attn_L448_bfloat16     2.13  True
+torch_mem_eff            cuda_attn_L512_bfloat16     2.27  True
 </pre></div>
-<div class="uv-install-logs" id="uv-logs-benchmark">
-<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
-<div class="uv-logs-content" style="display: none;">
-   Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading pillow (6.7MiB)
-Downloading sympy (6.0MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading fonttools (4.7MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading numpy (15.9MiB)
-Downloading setuptools (1.1MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading matplotlib (8.3MiB)
-Downloading networkx (1.9MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading torch (846.8MiB)
-Downloading triton (148.4MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
-      Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading matplotlib
- Downloading nvidia-cuda-cupti-cu12
- Downloading numpy
- Downloading nvidia-nvjitlink-cu12
- Downloading sympy
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
-Installed 37 packages in 228ms
-</div>
-</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
diff --git a/flash_attn/impls/sage_attention.html b/flash_attn/impls/sage_attention.html
index 364f1f4634c22af85c73498d50323f326f86ac56..ab4f80472a285c7007aef3edb3b9473b4ac8170b 100644
--- a/flash_attn/impls/sage_attention.html
+++ b/flash_attn/impls/sage_attention.html
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 44.02s
+Cell: benchmark | 4.37s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3921,76 +3921,28 @@ Cell: benchmark | 44.02s
 <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
 impl                     wl                  p50(ms)  ok
 sage_int8_fp16           cuda_attn_L128_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L256_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L320_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L384_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L448_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L512_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
 </pre></div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-   Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading pillow (6.7MiB)
-Downloading hf-xet (3.2MiB)
-Downloading networkx (1.9MiB)
-Downloading setuptools (1.1MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading numpy (15.9MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading sympy (6.0MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading fonttools (4.7MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading matplotlib (8.3MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading triton (148.4MiB)
-Downloading torch (846.8MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading hf-xet
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
-      Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading nvidia-cuda-cupti-cu12
- Downloading matplotlib
- Downloading numpy
- Downloading sympy
- Downloading nvidia-nvjitlink-cu12
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
-Installed 48 packages in 211ms
+Installed 1 package in 11ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 11 files:   0%|          | 0/11 [00:00&lt;?, ?it/s]
-Fetching 11 files:   9%|▉         | 1/11 [00:00&lt;00:02,  3.52it/s]
-Fetching 11 files:  73%|███████▎  | 8/11 [00:00&lt;00:00,  9.29it/s]
-Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 12.03it/s]</div>
+Fetching 11 files:  27%|██▋       | 3/11 [00:00&lt;00:00, 14.92it/s]
+Fetching 11 files:  73%|███████▎  | 8/11 [00:00&lt;00:00, 14.19it/s]
+Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 19.60it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
diff --git a/flash_attn/impls/xformers.html b/flash_attn/impls/xformers.html
index e7a61d66f041c6966cb9e2e4975416b77d7cb087..294f2ef1a8f4ed568426150c8120ee2f0c927541 100644
--- a/flash_attn/impls/xformers.html
+++ b/flash_attn/impls/xformers.html
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 45.32s
+Cell: benchmark | 5.09s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3923,21 +3923,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         5.28%     517.181us        24.85%       2.433ms       2.433ms       0.000us         0.00%      10.583ms      10.583ms             1  
-                             xformers_flash3::flash_fwd         2.21%     216.725us        19.17%       1.877ms     625.707us       0.000us         0.00%      10.583ms       3.528ms             3  
-                                      flash_attn_3::fwd         0.75%      73.471us        16.96%       1.660ms     553.465us       7.934ms       100.00%      10.583ms       3.528ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       7.935ms       100.02%       7.935ms       7.935ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       7.934ms       100.00%       7.934ms       2.645ms             3  
-                                Activity Buffer Request        15.30%       1.498ms        15.30%       1.498ms       1.498ms       2.649ms        33.39%       2.649ms       2.649ms             1  
-                                            aten::empty         0.35%      34.410us         0.35%      34.410us       5.735us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.13%      13.051us         0.13%      13.051us       4.350us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.42%      41.351us         0.42%      41.351us      13.784us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.14%      13.581us         0.40%      38.881us       6.480us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.26%      25.300us         0.26%      25.300us       4.217us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        75.15%       7.358ms        75.15%       7.358ms       7.358ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff        10.73%     481.606us        51.24%       2.299ms       2.299ms       0.000us         0.00%       3.630ms       3.630ms             1  
+                             xformers_flash3::flash_fwd         4.33%     194.084us        39.70%       1.781ms     593.782us       0.000us         0.00%       3.630ms       1.210ms             3  
+                                      flash_attn_3::fwd         1.76%      78.961us        35.37%       1.587ms     529.087us       2.729ms       100.00%       3.630ms       1.210ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.730ms       100.05%       2.730ms       2.730ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.729ms       100.00%       2.729ms     909.588us             3  
+                                Activity Buffer Request        31.70%       1.423ms        31.70%       1.423ms       1.423ms     901.535us        33.04%     901.535us     901.535us             1  
+                                            aten::empty         0.75%      33.761us         0.75%      33.761us       5.627us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.28%      12.380us         0.28%      12.380us       4.127us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.88%      39.570us         0.88%      39.570us      13.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.30%      13.520us         0.80%      36.080us       6.013us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.50%      22.560us         0.50%      22.560us       3.760us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        48.76%       2.188ms        48.76%       2.188ms       2.188ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 9.791ms
-Self CUDA time total: 7.934ms
+Self CPU time total: 4.487ms
+Self CUDA time total: 2.729ms
 
 
 
@@ -3947,21 +3947,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         2.97%     376.750us        17.03%       2.160ms       2.160ms       0.000us         0.00%      14.695ms      14.695ms             1  
-                             xformers_flash3::flash_fwd         1.31%     166.673us        13.88%       1.760ms     586.646us       0.000us         0.00%      14.695ms       4.898ms             3  
-                                      flash_attn_3::fwd         0.41%      52.370us        12.57%       1.593ms     531.088us      11.013ms       100.00%      14.695ms       4.898ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us      11.015ms       100.02%      11.015ms      11.015ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us      11.013ms       100.00%      11.013ms       3.671ms             3  
-                                Activity Buffer Request        11.62%       1.473ms        11.62%       1.473ms       1.473ms       3.682ms        33.43%       3.682ms       3.682ms             1  
-                                            aten::empty         0.22%      28.511us         0.22%      28.511us       4.752us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.04%       5.391us         0.04%       5.391us       1.797us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.27%      34.441us         0.27%      34.441us      11.480us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.07%       8.699us         0.18%      22.949us       3.825us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.11%      14.250us         0.11%      14.250us       2.375us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        82.97%      10.518ms        82.97%      10.518ms      10.518ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff         7.10%     312.113us        46.81%       2.059ms       2.059ms       0.000us         0.00%       3.744ms       3.744ms             1  
+                             xformers_flash3::flash_fwd         3.88%     170.673us        39.17%       1.723ms     574.405us       0.000us         0.00%       3.744ms       1.248ms             3  
+                                      flash_attn_3::fwd         1.28%      56.171us        35.29%       1.553ms     517.514us       2.795ms       100.00%       3.744ms       1.248ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.796ms       100.05%       2.796ms       2.796ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.795ms       100.00%       2.795ms     931.630us             3  
+                                Activity Buffer Request        32.47%       1.428ms        32.47%       1.428ms       1.428ms     948.729us        33.95%     948.729us     948.729us             1  
+                                            aten::empty         0.66%      29.091us         0.66%      29.091us       4.848us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.13%       5.590us         0.13%       5.590us       1.863us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.76%      33.440us         0.76%      33.440us      11.147us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.20%       8.951us         0.54%      23.831us       3.972us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.34%      14.880us         0.34%      14.880us       2.480us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        53.19%       2.340ms        53.19%       2.340ms       2.340ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 12.678ms
-Self CUDA time total: 11.013ms
+Self CPU time total: 4.399ms
+Self CUDA time total: 2.795ms
 
 
 
@@ -3971,21 +3971,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         2.76%     351.879us        17.06%       2.178ms       2.178ms       0.000us         0.00%      14.911ms      14.911ms             1  
-                             xformers_flash3::flash_fwd         1.47%     187.843us        14.11%       1.803ms     600.839us       0.000us         0.00%      14.911ms       4.970ms             3  
-                                      flash_attn_3::fwd         0.41%      52.611us        12.64%       1.615ms     538.225us      11.083ms       100.00%      14.911ms       4.970ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us      11.085ms       100.02%      11.085ms      11.085ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us      11.083ms       100.00%      11.083ms       3.694ms             3  
-                                Activity Buffer Request        11.67%       1.491ms        11.67%       1.491ms       1.491ms       3.829ms        34.54%       3.829ms       3.829ms             1  
-                                            aten::empty         0.23%      29.661us         0.23%      29.661us       4.944us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.04%       5.680us         0.04%       5.680us       1.893us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.28%      35.941us         0.28%      35.941us      11.980us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.07%       8.779us         0.19%      23.920us       3.987us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.12%      15.141us         0.12%      15.141us       2.524us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        82.94%      10.593ms        82.94%      10.593ms      10.593ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff         6.52%     299.466us        45.41%       2.085ms       2.085ms       0.000us         0.00%       3.907ms       3.907ms             1  
+                             xformers_flash3::flash_fwd         3.09%     142.061us        38.39%       1.763ms     587.558us       0.000us         0.00%       3.907ms       1.302ms             3  
+                                      flash_attn_3::fwd         1.15%      53.012us        35.30%       1.621ms     540.204us       2.913ms       100.00%       3.907ms       1.302ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.915ms       100.06%       2.915ms       2.915ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.913ms       100.00%       2.913ms     971.158us             3  
+                                Activity Buffer Request        32.68%       1.500ms        32.68%       1.500ms       1.500ms     993.281us        34.09%     993.281us     993.281us             1  
+                                            aten::empty         0.62%      28.380us         0.62%      28.380us       4.730us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.11%       5.270us         0.11%       5.270us       1.757us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.73%      33.640us         0.73%      33.640us      11.213us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.18%       8.421us         0.49%      22.660us       3.777us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.31%      14.239us         0.31%      14.239us       2.373us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        54.59%       2.507ms        54.59%       2.507ms       2.507ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 12.771ms
-Self CUDA time total: 11.083ms
+Self CPU time total: 4.591ms
+Self CUDA time total: 2.913ms
 
 
 
@@ -3995,21 +3995,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         2.60%     343.688us        18.22%       2.412ms       2.412ms       0.000us         0.00%      15.065ms      15.065ms             1  
-                             xformers_flash3::flash_fwd         1.25%     165.081us        15.45%       2.045ms     681.611us       0.000us         0.00%      15.065ms       5.022ms             3  
-                                      flash_attn_3::fwd         0.38%      50.950us        14.20%       1.880ms     626.584us      11.285ms       100.00%      15.065ms       5.022ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us      11.286ms       100.02%      11.286ms      11.286ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us      11.285ms       100.00%      11.285ms       3.762ms             3  
-                                Activity Buffer Request        11.56%       1.531ms        11.56%       1.531ms       1.531ms       3.781ms        33.50%       3.781ms       3.781ms             1  
-                                            aten::empty         0.22%      29.192us         0.22%      29.192us       4.865us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.04%       5.370us         0.04%       5.370us       1.790us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.99%     263.376us         1.99%     263.376us      87.792us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.07%       9.160us         0.18%      23.762us       3.960us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.11%      14.602us         0.11%      14.602us       2.434us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        81.78%      10.825ms        81.78%      10.825ms      10.825ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff         6.26%     300.335us        46.54%       2.234ms       2.234ms       0.000us         0.00%       3.980ms       3.980ms             1  
+                             xformers_flash3::flash_fwd         3.08%     147.673us        39.81%       1.911ms     637.009us       0.000us         0.00%       3.980ms       1.327ms             3  
+                                      flash_attn_3::fwd         1.12%      53.571us        36.74%       1.763ms     587.785us       2.981ms       100.00%       3.980ms       1.327ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.982ms       100.05%       2.982ms       2.982ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.981ms       100.00%       2.981ms     993.631us             3  
+                                Activity Buffer Request        29.81%       1.431ms        29.81%       1.431ms       1.431ms     999.263us        33.52%     999.263us     999.263us             1  
+                                            aten::empty         0.60%      28.930us         0.60%      28.930us       4.822us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.12%       5.610us         0.12%       5.610us       1.870us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         5.09%     244.533us         5.09%     244.533us      81.511us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.18%       8.489us         0.47%      22.530us       3.755us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.29%      14.041us         0.29%      14.041us       2.340us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        53.46%       2.566ms        53.46%       2.566ms       2.566ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 13.238ms
-Self CUDA time total: 11.285ms
+Self CPU time total: 4.800ms
+Self CUDA time total: 2.981ms
 
 
 
@@ -4019,21 +4019,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         2.46%     345.459us        17.01%       2.385ms       2.385ms       0.000us         0.00%      16.124ms      16.124ms             1  
-                             xformers_flash3::flash_fwd         1.15%     161.632us        14.38%       2.017ms     672.171us       0.000us         0.00%      16.124ms       5.375ms             3  
-                                      flash_attn_3::fwd         0.37%      51.683us        13.23%       1.855ms     618.293us      12.092ms       100.00%      16.124ms       5.375ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us      12.094ms       100.02%      12.094ms      12.094ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us      12.092ms       100.00%      12.092ms       4.031ms             3  
-                                Activity Buffer Request        10.69%       1.499ms        10.69%       1.499ms       1.499ms       4.032ms        33.35%       4.032ms       4.032ms             1  
-                                            aten::empty         0.21%      29.140us         0.21%      29.140us       4.857us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.04%       5.520us         0.04%       5.520us       1.840us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.92%     269.435us         1.92%     269.435us      89.812us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.06%       9.069us         0.16%      22.880us       3.813us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.10%      13.811us         0.10%      13.811us       2.302us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        82.99%      11.636ms        82.99%      11.636ms      11.636ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff         5.98%     313.865us        42.05%       2.207ms       2.207ms       0.000us         0.00%       4.635ms       4.635ms             1  
+                             xformers_flash3::flash_fwd         2.80%     146.723us        35.63%       1.870ms     623.176us       0.000us         0.00%       4.635ms       1.545ms             3  
+                                      flash_attn_3::fwd         0.99%      51.861us        32.83%       1.723ms     574.268us       3.467ms       100.00%       4.635ms       1.545ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       3.469ms       100.05%       3.469ms       3.469ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.467ms       100.00%       3.467ms       1.156ms             3  
+                                Activity Buffer Request        27.82%       1.460ms        27.82%       1.460ms       1.460ms       1.168ms        33.68%       1.168ms       1.168ms             1  
+                                            aten::empty         0.56%      29.260us         0.56%      29.260us       4.877us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.12%       6.040us         0.12%       6.040us       2.013us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.35%     175.903us         3.35%     175.903us      58.634us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.16%       8.638us         0.44%      23.169us       3.862us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.28%      14.531us         0.28%      14.531us       2.422us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        57.95%       3.041ms        57.95%       3.041ms       3.041ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 14.021ms
-Self CUDA time total: 12.092ms
+Self CPU time total: 5.247ms
+Self CUDA time total: 3.467ms
 
 
 
@@ -4043,83 +4043,37 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         2.36%     347.389us        16.65%       2.455ms       2.455ms       0.000us         0.00%      16.980ms      16.980ms             1  
-                             xformers_flash3::flash_fwd         1.09%     160.181us        14.14%       2.085ms     695.001us       0.000us         0.00%      16.980ms       5.660ms             3  
-                                      flash_attn_3::fwd         0.36%      52.921us        13.05%       1.925ms     641.607us      12.735ms       100.00%      16.980ms       5.660ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us      12.738ms       100.02%      12.738ms      12.738ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us      12.735ms       100.00%      12.735ms       4.245ms             3  
-                                Activity Buffer Request        10.11%       1.491ms        10.11%       1.491ms       1.491ms       4.245ms        33.33%       4.245ms       4.245ms             1  
-                                            aten::empty         0.20%      29.922us         0.20%      29.922us       4.987us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.04%       5.530us         0.04%       5.530us       1.843us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.34%     345.117us         2.34%     345.117us     115.039us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.06%       8.379us         0.15%      22.620us       3.770us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.10%      14.241us         0.10%      14.241us       2.373us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        83.35%      12.290ms        83.35%      12.290ms      12.290ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff         5.97%     309.094us        41.86%       2.166ms       2.166ms       0.000us         0.00%       4.567ms       4.567ms             1  
+                             xformers_flash3::flash_fwd         2.75%     142.242us        35.45%       1.834ms     611.405us       0.000us         0.00%       4.567ms       1.522ms             3  
+                                      flash_attn_3::fwd         1.04%      53.951us        32.70%       1.692ms     563.991us       3.419ms       100.00%       4.567ms       1.522ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       3.421ms       100.05%       3.421ms       3.421ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.419ms       100.00%       3.419ms       1.140ms             3  
+                                Activity Buffer Request        27.74%       1.436ms        27.74%       1.436ms       1.436ms       1.148ms        33.59%       1.148ms       1.148ms             1  
+                                            aten::empty         0.58%      29.770us         0.58%      29.770us       4.962us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.11%       5.591us         0.11%       5.591us       1.864us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.23%     167.152us         3.23%     167.152us      55.717us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.16%       8.371us         0.44%      22.751us       3.792us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.28%      14.380us         0.28%      14.380us       2.397us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        58.14%       3.008ms        58.14%       3.008ms       3.008ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 14.745ms
-Self CUDA time total: 12.735ms
+Self CPU time total: 5.174ms
+Self CUDA time total: 3.419ms
 
 
 impl                     wl                  p50(ms)  ok
-xformers_meff            cuda_attn_L128_bfloat16     3.60  True
-xformers_meff            cuda_attn_L256_bfloat16     3.43  True
-xformers_meff            cuda_attn_L320_bfloat16     4.10  True
-xformers_meff            cuda_attn_L384_bfloat16     4.01  True
-xformers_meff            cuda_attn_L448_bfloat16     4.21  True
-xformers_meff            cuda_attn_L512_bfloat16     4.43  True
+xformers_meff            cuda_attn_L128_bfloat16     1.00  True
+xformers_meff            cuda_attn_L256_bfloat16     1.04  True
+xformers_meff            cuda_attn_L320_bfloat16     1.09  True
+xformers_meff            cuda_attn_L384_bfloat16     1.11  True
+xformers_meff            cuda_attn_L448_bfloat16     1.26  True
+xformers_meff            cuda_attn_L512_bfloat16     1.25  True
 </pre></div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-   Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading triton (148.4MiB)
-Downloading matplotlib (8.3MiB)
-Downloading pillow (6.7MiB)
-Downloading setuptools (1.1MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading torch (846.8MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading networkx (1.9MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading fonttools (4.7MiB)
-Downloading numpy (15.9MiB)
-Downloading sympy (6.0MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 Downloading xformers (111.8MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
-      Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading nvidia-cuda-cupti-cu12
- Downloading matplotlib
- Downloading numpy
- Downloading sympy
- Downloading nvidia-nvjitlink-cu12
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
  Downloading xformers
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
-Installed 38 packages in 211ms
+Installed 1 package in 14ms
 </div>
 </div>
 <div class="cell-artifacts">
diff --git a/flash_attn/results/artifacts/combine/latency.svg b/flash_attn/results/artifacts/combine/latency.svg
index 3367735a9738724cca392eba39308eb7893657bd..671245b29a5bb3712886378686087bbd4b801023 100644
--- a/flash_attn/results/artifacts/combine/latency.svg
+++ b/flash_attn/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b3bfd4c5e82f8daf2fec939924eb6dc23b3e5d20e8327316e4f8b69db047e2a9
-size 24011
+oid sha256:a94beca550ea0b3ff8a0f0eef062da6a6179ae09e78edc24cbacb71d8bd623a2
+size 24784
diff --git a/flash_attn/results/combined_results.html b/flash_attn/results/combined_results.html
index 83a9d73329f9582b6415102ca8b59cbdb9f81586..d8c030e7979e6d4a765ee5da1a3df4da10896636 100644
--- a/flash_attn/results/combined_results.html
+++ b/flash_attn/results/combined_results.html
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-24T19:27:34.267507</dc:date>
+    <dc:date>2025-10-27T14:46:38.946915</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -3891,320 +3891,333 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
   </g>
   <g id="axes--1" class="axes">
    <g id="patch_2">
-    <path d="M 38.27 447.507117  L 835.361742 447.507117  L 835.361742 26.88  L 38.27 26.88  L 38.27 447.507117  z " style="fill: none" />
+    <path d="M 47.81 447.507117  L 835.361742 447.507117  L 835.361742 26.88  L 47.81 26.88  L 47.81 447.507117  z " style="fill: none" />
    </g>
    <g id="matplotlib.axis_1">
     <g id="xtick_1">
      <g id="grid-x--1" class="grid grid-x">
-      <path d="M 74.501443 447.507117  L 74.501443 26.88  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 83.607806 447.507117  L 83.607806 26.88  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_1">
       <defs>
        <path id="mafb3703e5b" d="M 0 0  L 0 3.5  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#mafb3703e5b" x="74.501443" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#mafb3703e5b" x="83.607806" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_1">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(31.87119 548.84621) rotate(-45)">cuda_attn_L128_bfloat16</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(40.977554 548.84621) rotate(-45)">cuda_attn_L128_bfloat16</text>
      </g>
     </g>
     <g id="xtick_2">
      <g id="grid-x--2" class="grid grid-x">
-      <path d="M 219.427214 447.507117  L 219.427214 26.88  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 226.799032 447.507117  L 226.799032 26.88  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_2">
       <g>
-       <use ns4:href="#mafb3703e5b" x="219.427214" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#mafb3703e5b" x="226.799032" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_2">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(176.796962 548.84621) rotate(-45)">cuda_attn_L256_bfloat16</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(184.16878 548.84621) rotate(-45)">cuda_attn_L256_bfloat16</text>
      </g>
     </g>
     <g id="xtick_3">
      <g id="grid-x--3" class="grid grid-x">
-      <path d="M 364.352985 447.507117  L 364.352985 26.88  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 369.990258 447.507117  L 369.990258 26.88  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_3">
       <g>
-       <use ns4:href="#mafb3703e5b" x="364.352985" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#mafb3703e5b" x="369.990258" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_3">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(321.722733 548.84621) rotate(-45)">cuda_attn_L320_bfloat16</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(327.360005 548.84621) rotate(-45)">cuda_attn_L320_bfloat16</text>
      </g>
     </g>
     <g id="xtick_4">
      <g id="grid-x--4" class="grid grid-x">
-      <path d="M 509.278756 447.507117  L 509.278756 26.88  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 513.181484 447.507117  L 513.181484 26.88  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_4">
       <g>
-       <use ns4:href="#mafb3703e5b" x="509.278756" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#mafb3703e5b" x="513.181484" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_4">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(466.648504 548.84621) rotate(-45)">cuda_attn_L384_bfloat16</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(470.551231 548.84621) rotate(-45)">cuda_attn_L384_bfloat16</text>
      </g>
     </g>
     <g id="xtick_5">
      <g id="grid-x--5" class="grid grid-x">
-      <path d="M 654.204528 447.507117  L 654.204528 26.88  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 656.37271 447.507117  L 656.37271 26.88  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_5">
       <g>
-       <use ns4:href="#mafb3703e5b" x="654.204528" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#mafb3703e5b" x="656.37271" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_5">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(611.574275 548.84621) rotate(-45)">cuda_attn_L448_bfloat16</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(613.742457 548.84621) rotate(-45)">cuda_attn_L448_bfloat16</text>
      </g>
     </g>
     <g id="xtick_6">
      <g id="grid-x--6" class="grid grid-x">
-      <path d="M 799.130299 447.507117  L 799.130299 26.88  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 799.563935 447.507117  L 799.563935 26.88  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_6">
       <g>
-       <use ns4:href="#mafb3703e5b" x="799.130299" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#mafb3703e5b" x="799.563935" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_6">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(756.500046 548.84621) rotate(-45)">cuda_attn_L512_bfloat16</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(756.933683 548.84621) rotate(-45)">cuda_attn_L512_bfloat16</text>
      </g>
     </g>
     <g id="label--x" class="xlabel">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="436.815871" y="562.111872" transform="rotate(-0 436.815871 562.111872)">Workload</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="441.585871" y="562.111872" transform="rotate(-0 441.585871 562.111872)">Workload</text>
     </g>
    </g>
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 38.27 416.200827  L 835.361742 416.200827  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 413.210177  L 835.361742 413.210177  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_7">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="38.27" y="416.200827" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="413.210177" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_7">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="31.27" y="420.000045" transform="rotate(-0 31.27 420.000045)">3</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="417.009396" transform="rotate(-0 40.81 417.009396)">1.0</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 38.27 348.555726  L 835.361742 348.555726  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 355.233116  L 835.361742 355.233116  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_8">
       <g>
-       <use ns4:href="#m0fca2865ba" x="38.27" y="348.555726" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="355.233116" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_8">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="31.27" y="352.354944" transform="rotate(-0 31.27 352.354944)">4</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="359.032335" transform="rotate(-0 40.81 359.032335)">1.2</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 38.27 280.910625  L 835.361742 280.910625  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 297.256055  L 835.361742 297.256055  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_9">
       <g>
-       <use ns4:href="#m0fca2865ba" x="38.27" y="280.910625" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="297.256055" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_9">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="31.27" y="284.709843" transform="rotate(-0 31.27 284.709843)">5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="301.055273" transform="rotate(-0 40.81 301.055273)">1.4</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 38.27 213.265524  L 835.361742 213.265524  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 239.278993  L 835.361742 239.278993  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_10">
       <g>
-       <use ns4:href="#m0fca2865ba" x="38.27" y="213.265524" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="239.278993" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_10">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="31.27" y="217.064743" transform="rotate(-0 31.27 217.064743)">6</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="243.078212" transform="rotate(-0 40.81 243.078212)">1.6</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 38.27 145.620423  L 835.361742 145.620423  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 181.301932  L 835.361742 181.301932  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_11">
       <g>
-       <use ns4:href="#m0fca2865ba" x="38.27" y="145.620423" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="181.301932" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_11">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="31.27" y="149.419642" transform="rotate(-0 31.27 149.419642)">7</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="185.101151" transform="rotate(-0 40.81 185.101151)">1.8</text>
      </g>
     </g>
     <g id="ytick_6">
      <g id="grid-y--7" class="grid grid-y">
-      <path d="M 38.27 77.975322  L 835.361742 77.975322  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 123.324871  L 835.361742 123.324871  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_12">
       <g>
-       <use ns4:href="#m0fca2865ba" x="38.27" y="77.975322" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="123.324871" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_12">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="31.27" y="81.774541" transform="rotate(-0 31.27 81.774541)">8</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="127.124089" transform="rotate(-0 40.81 127.124089)">2.0</text>
+     </g>
+    </g>
+    <g id="ytick_7">
+     <g id="grid-y--8" class="grid grid-y">
+      <path d="M 47.81 65.347809  L 835.361742 65.347809  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_13">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.81" y="65.347809" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_13">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="69.147028" transform="rotate(-0 40.81 69.147028)">2.2</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.827813" y="237.193558" transform="rotate(-90 18.827813 237.193558)">Latency P50 (ms)</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.827187" y="237.193558" transform="rotate(-90 18.827187 237.193558)">Latency P50 (ms)</text>
     </g>
    </g>
    <g id="series--torch-flash-ma" class="series">
-    <path d="M 74.501443 342.178283  L 219.427214 295.140246  L 364.352985 287.657208  L 509.278756 281.974208  L 654.204528 277.480409  L 799.130299 249.123044  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 349.439178  L 226.799032 333.602454  L 369.990258 324.473676  L 513.181484 316.069901  L 656.37271 272.899601  L 799.563935 261.559288  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
-    <g clip-path="url(#p0d2e0c97d5)">
-     <use ns4:href="#md7efaf3aec" x="74.501443" y="342.178283" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="219.427214" y="295.140246" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="364.352985" y="287.657208" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="509.278756" y="281.974208" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="654.204528" y="277.480409" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="799.130299" y="249.123044" style="fill: #1f77b4; stroke: #1f77b4" />
+    <g clip-path="url(#p09feef2583)">
+     <use ns4:href="#md7efaf3aec" x="83.607806" y="349.439178" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="226.799032" y="333.602454" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="369.990258" y="324.473676" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="513.181484" y="316.069901" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="656.37271" y="272.899601" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="799.563935" y="261.559288" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-mem-eff" class="series">
-    <path d="M 74.501443 161.453774  L 219.427214 129.630134  L 364.352985 110.285937  L 509.278756 105.945081  L 654.204528 80.334108  L 799.130299 45.999414  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 156.020744  L 226.799032 138.969401  L 369.990258 109.128607  L 513.181484 99.249026  L 656.37271 87.05645  L 799.563935 45.999414  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
-    <g clip-path="url(#p0d2e0c97d5)">
-     <use ns4:href="#m9b8c54d372" x="74.501443" y="161.453774" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="219.427214" y="129.630134" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="364.352985" y="110.285937" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="509.278756" y="105.945081" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="654.204528" y="80.334108" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="799.130299" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
+    <g clip-path="url(#p09feef2583)">
+     <use ns4:href="#m9b8c54d372" x="83.607806" y="156.020744" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="226.799032" y="138.969401" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="369.990258" y="109.128607" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="513.181484" y="99.249026" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="656.37271" y="87.05645" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="series--xformers-meff" class="series">
-    <path d="M 74.501443 375.900177  L 219.427214 387.059046  L 364.352985 342.097108  L 509.278756 347.750344  L 654.204528 334.407754  L 799.130299 319.565404  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 414.345368  L 226.799032 400.181572  L 369.990258 385.808769  L 513.181484 380.581847  L 656.37271 338.122056  L 799.563935 339.866876  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="mc655281e0b" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #2ca02c" />
     </defs>
-    <g clip-path="url(#p0d2e0c97d5)">
-     <use ns4:href="#mc655281e0b" x="74.501443" y="375.900177" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="219.427214" y="387.059046" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="364.352985" y="342.097108" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="509.278756" y="347.750344" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="654.204528" y="334.407754" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="799.130299" y="319.565404" style="fill: #2ca02c; stroke: #2ca02c" />
+    <g clip-path="url(#p09feef2583)">
+     <use ns4:href="#mc655281e0b" x="83.607806" y="414.345368" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="226.799032" y="400.181572" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="369.990258" y="385.808769" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="513.181484" y="380.581847" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="656.37271" y="338.122056" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="799.563935" y="339.866876" style="fill: #2ca02c; stroke: #2ca02c" />
     </g>
    </g>
    <g id="series--hf-kernels-flash-attn" class="series">
-    <path d="M 74.501443 428.387702  L 219.427214 354.446599  L 364.352985 340.491147  L 509.278756 339.909398  L 654.204528 340.987724  L 799.130299 309.967712  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 420.20395  L 226.799032 407.432473  L 369.990258 399.40236  L 513.181484 392.590345  L 656.37271 345.709514  L 799.563935 346.355668  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m61c8040d7e" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #d62728" />
     </defs>
-    <g clip-path="url(#p0d2e0c97d5)">
-     <use ns4:href="#m61c8040d7e" x="74.501443" y="428.387702" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="219.427214" y="354.446599" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="364.352985" y="340.491147" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="509.278756" y="339.909398" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="654.204528" y="340.987724" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="799.130299" y="309.967712" style="fill: #d62728; stroke: #d62728" />
+    <g clip-path="url(#p09feef2583)">
+     <use ns4:href="#m61c8040d7e" x="83.607806" y="420.20395" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="226.799032" y="407.432473" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="369.990258" y="399.40236" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="513.181484" y="392.590345" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="656.37271" y="345.709514" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="799.563935" y="346.355668" style="fill: #d62728; stroke: #d62728" />
     </g>
    </g>
    <g id="series--hf-kernels-flash-attn3" class="series">
-    <path d="M 74.501443 401.329117  L 219.427214 364.396182  L 364.352985 354.430502  L 509.278756 350.637501  L 654.204528 335.557722  L 799.130299 321.02931  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 428.387702  L 226.799032 420.061906  L 369.990258 405.625328  L 513.181484 401.010644  L 656.37271 352.807645  L 799.563935 359.622849  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m7cd35be9cc" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #9467bd" />
     </defs>
-    <g clip-path="url(#p0d2e0c97d5)">
-     <use ns4:href="#m7cd35be9cc" x="74.501443" y="401.329117" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="219.427214" y="364.396182" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="364.352985" y="354.430502" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="509.278756" y="350.637501" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="654.204528" y="335.557722" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="799.130299" y="321.02931" style="fill: #9467bd; stroke: #9467bd" />
+    <g clip-path="url(#p09feef2583)">
+     <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="226.799032" y="420.061906" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="369.990258" y="405.625328" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="513.181484" y="401.010644" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="656.37271" y="352.807645" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="799.563935" y="359.622849" style="fill: #9467bd; stroke: #9467bd" />
     </g>
    </g>
    <g id="patch_3">
-    <path d="M 38.27 447.507117  L 38.27 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+    <path d="M 47.81 447.507117  L 47.81 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
    </g>
    <g id="patch_4">
     <path d="M 835.361742 447.507117  L 835.361742 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
    </g>
    <g id="patch_5">
-    <path d="M 38.27 447.507117  L 835.361742 447.507117  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+    <path d="M 47.81 447.507117  L 835.361742 447.507117  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
    </g>
    <g id="patch_6">
-    <path d="M 38.27 26.88  L 835.361742 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+    <path d="M 47.81 26.88  L 835.361742 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
    </g>
-   <g id="text_13">
-    <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="436.815871" y="20.88" transform="rotate(-0 436.815871 20.88)">Attention Implementation Latency</text>
+   <g id="text_14">
+    <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="441.585871" y="20.88" transform="rotate(-0 441.585871 20.88)">Attention Implementation Latency</text>
    </g>
    <g id="legend" class="legend">
     <g id="patch_7">
-     <path d="M 45.27 109.66125  L 188.765313 109.66125  Q 190.765313 109.66125 190.765313 107.66125  L 190.765313 33.88  Q 190.765313 31.88 188.765313 31.88  L 45.27 31.88  Q 43.27 31.88 43.27 33.88  L 43.27 107.66125  Q 43.27 109.66125 45.27 109.66125  L 45.27 109.66125  z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
+     <path d="M 54.81 109.66125  L 198.305313 109.66125  Q 200.305313 109.66125 200.305313 107.66125  L 200.305313 33.88  Q 200.305313 31.88 198.305313 31.88  L 54.81 31.88  Q 52.81 31.88 52.81 33.88  L 52.81 107.66125  Q 52.81 109.66125 54.81 109.66125  L 54.81 109.66125  z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
     </g>
-    <g id="line2d_13">
-     <path d="M 47.27 39.978438  L 57.27 39.978438  L 67.27 39.978438  " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <g id="line2d_14">
+     <path d="M 56.81 39.978438  L 66.81 39.978438  L 76.81 39.978438  " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#md7efaf3aec" x="57.27" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
+      <use ns4:href="#md7efaf3aec" x="66.81" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
      </g>
     </g>
     <g id="legend-label--torch-flash-ma" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="75.27" y="43.478438" transform="rotate(-0 75.27 43.478438)">torch_flash_ma</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="43.478438" transform="rotate(-0 84.81 43.478438)">torch_flash_ma</text>
     </g>
-    <g id="line2d_14">
-     <path d="M 47.27 54.934687  L 57.27 54.934687  L 67.27 54.934687  " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <g id="line2d_15">
+     <path d="M 56.81 54.934687  L 66.81 54.934687  L 76.81 54.934687  " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#m9b8c54d372" x="57.27" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
+      <use ns4:href="#m9b8c54d372" x="66.81" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
      </g>
     </g>
     <g id="legend-label--torch-mem-eff" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="75.27" y="58.434687" transform="rotate(-0 75.27 58.434687)">torch_mem_eff</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="58.434687" transform="rotate(-0 84.81 58.434687)">torch_mem_eff</text>
     </g>
-    <g id="line2d_15">
-     <path d="M 47.27 69.890938  L 57.27 69.890938  L 67.27 69.890938  " style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
+    <g id="line2d_16">
+     <path d="M 56.81 69.890938  L 66.81 69.890938  L 76.81 69.890938  " style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#mc655281e0b" x="57.27" y="69.890938" style="fill: #2ca02c; stroke: #2ca02c" />
+      <use ns4:href="#mc655281e0b" x="66.81" y="69.890938" style="fill: #2ca02c; stroke: #2ca02c" />
      </g>
     </g>
     <g id="legend-label--xformers-meff" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="75.27" y="73.390938" transform="rotate(-0 75.27 73.390938)">xformers_meff</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="73.390938" transform="rotate(-0 84.81 73.390938)">xformers_meff</text>
     </g>
-    <g id="line2d_16">
-     <path d="M 47.27 84.847188  L 57.27 84.847188  L 67.27 84.847188  " style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
+    <g id="line2d_17">
+     <path d="M 56.81 84.847188  L 66.81 84.847188  L 76.81 84.847188  " style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#m61c8040d7e" x="57.27" y="84.847188" style="fill: #d62728; stroke: #d62728" />
+      <use ns4:href="#m61c8040d7e" x="66.81" y="84.847188" style="fill: #d62728; stroke: #d62728" />
      </g>
     </g>
     <g id="legend-label--hf-kernels-flash-attn" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="75.27" y="88.347188" transform="rotate(-0 75.27 88.347188)">hf_kernels_flash_attn</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="88.347188" transform="rotate(-0 84.81 88.347188)">hf_kernels_flash_attn</text>
     </g>
-    <g id="line2d_17">
-     <path d="M 47.27 99.803438  L 57.27 99.803438  L 67.27 99.803438  " style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
+    <g id="line2d_18">
+     <path d="M 56.81 99.803438  L 66.81 99.803438  L 76.81 99.803438  " style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#m7cd35be9cc" x="57.27" y="99.803438" style="fill: #9467bd; stroke: #9467bd" />
+      <use ns4:href="#m7cd35be9cc" x="66.81" y="99.803438" style="fill: #9467bd; stroke: #9467bd" />
      </g>
     </g>
     <g id="legend-label--hf-kernels-flash-attn3" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="75.27" y="103.303438" transform="rotate(-0 75.27 103.303438)">hf_kernels_flash_attn3</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="103.303438" transform="rotate(-0 84.81 103.303438)">hf_kernels_flash_attn3</text>
     </g>
    </g>
   </g>
  </g>
  <defs>
-  <clipPath id="p0d2e0c97d5">
-   <rect x="38.27" y="26.88" width="797.091742" height="420.627117" />
+  <clipPath id="p09feef2583">
+   <rect x="47.81" y="26.88" width="787.551742" height="420.627117" />
   </clipPath>
  </defs>
 </svg>
@@ -4217,7 +4230,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: combine | 39.40s
+Cell: combine | 4.50s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4297,25 +4310,25 @@ Cell: combine | 39.40s
 <div class="cell-stdout"><pre class="stdout-text">======================================================================
 LOADING BENCHMARK DATA
 ======================================================================
-✓ Flash (PyTorch SDPA)          : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/1229e2a918a2e0c395750645114ee4e0e721d5f703c5221972db88ca3fe9e8b9
-✓ MemEff (PyTorch SDPA)         : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/2ad4910cf70b34b5a3a316e2d789b9763d6651ee3d6727249ff229320cd58d24
-✓ xFormers                      : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/4883c01f586350408f08d21d8d78943b44e953dd559356f9392f803696daca1a
-✓ HF Kernels Flash Attn         : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/0c83351d95394732eb53074c734760a0bef9733834298a6c04a08d9ec6a12660
-✓ HF Kernels Flash Attn3        : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/f514c47cfa55db88f58672a033a040f67509b5051b8e3f332dd6d20ae85a88a8
-✓ SageAttention                 : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/44c62c72b6cd63934c6e75ade8b74c9428734ea4f94c030632b382a8f0107a57
+✓ Flash (PyTorch SDPA)          : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/4b81c2b991fc4a0f70c4117e933abc4007fd7f3f55394d7778a4074adf29df04
+✓ MemEff (PyTorch SDPA)         : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/a23b7ad9cfb9e9968ec4a8f126174dc4a3ab5e6999c65a44570f93656598bd2f
+✓ xFormers                      : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/269846603898e0ee1872d7a8b40fca43ba558b2f3400f8a7bedb1ee79df7da58
+✓ HF Kernels Flash Attn         : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/c1c92a22d205ca145ffb0083188c0f8eef512cfd6aa091b1e49d6329fbd08849
+✓ HF Kernels Flash Attn3        : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20
+✓ SageAttention                 : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/1355120a3e88bcb74f4130be51dfe8b03e7dc2b7823f2a53b20da7899570a16f
 
   ✓ Found Flash (PyTorch SDPA)
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/1229e2a918a2e0c395750645114ee4e0e721d5f703c5221972db88ca3fe9e8b9/attention.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/4b81c2b991fc4a0f70c4117e933abc4007fd7f3f55394d7778a4074adf29df04/attention.jsonl
   ✓ Found MemEff (PyTorch SDPA)
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/2ad4910cf70b34b5a3a316e2d789b9763d6651ee3d6727249ff229320cd58d24/attention.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/a23b7ad9cfb9e9968ec4a8f126174dc4a3ab5e6999c65a44570f93656598bd2f/attention.jsonl
   ✓ Found xFormers
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/4883c01f586350408f08d21d8d78943b44e953dd559356f9392f803696daca1a/attention.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/269846603898e0ee1872d7a8b40fca43ba558b2f3400f8a7bedb1ee79df7da58/attention.jsonl
   ✓ Found HF Kernels Flash Attn
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/0c83351d95394732eb53074c734760a0bef9733834298a6c04a08d9ec6a12660/attention.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/c1c92a22d205ca145ffb0083188c0f8eef512cfd6aa091b1e49d6329fbd08849/attention.jsonl
   ✓ Found HF Kernels Flash Attn3
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/f514c47cfa55db88f58672a033a040f67509b5051b8e3f332dd6d20ae85a88a8/attention.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20/attention.jsonl
   ✓ Found SageAttention
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/44c62c72b6cd63934c6e75ade8b74c9428734ea4f94c030632b382a8f0107a57/attention.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/1355120a3e88bcb74f4130be51dfe8b03e7dc2b7823f2a53b20da7899570a16f/attention.jsonl
 
 ======================================================================
 Summary: 6 found, 0 skipped, 0 missing
@@ -4324,48 +4337,48 @@ Summary: 6 found, 0 skipped, 0 missing
 COMBINED BENCHMARK SUMMARY
 
 impl                     wl                  p50(ms)  ok
-hf_kernels_flash_attn    cuda_attn_L128_bfloat16     2.82  True
-hf_kernels_flash_attn    cuda_attn_L256_bfloat16     3.91  True
-hf_kernels_flash_attn    cuda_attn_L320_bfloat16     4.12  True
-hf_kernels_flash_attn    cuda_attn_L384_bfloat16     4.13  True
-hf_kernels_flash_attn    cuda_attn_L448_bfloat16     4.11  True
-hf_kernels_flash_attn    cuda_attn_L512_bfloat16     4.57  True
-hf_kernels_flash_attn3   cuda_attn_L128_bfloat16     3.22  True
-hf_kernels_flash_attn3   cuda_attn_L256_bfloat16     3.77  True
-hf_kernels_flash_attn3   cuda_attn_L320_bfloat16     3.91  True
-hf_kernels_flash_attn3   cuda_attn_L384_bfloat16     3.97  True
-hf_kernels_flash_attn3   cuda_attn_L448_bfloat16     4.19  True
-hf_kernels_flash_attn3   cuda_attn_L512_bfloat16     4.41  True
+hf_kernels_flash_attn    cuda_attn_L128_bfloat16     0.98  True
+hf_kernels_flash_attn    cuda_attn_L256_bfloat16     1.02  True
+hf_kernels_flash_attn    cuda_attn_L320_bfloat16     1.05  True
+hf_kernels_flash_attn    cuda_attn_L384_bfloat16     1.07  True
+hf_kernels_flash_attn    cuda_attn_L448_bfloat16     1.23  True
+hf_kernels_flash_attn    cuda_attn_L512_bfloat16     1.23  True
+hf_kernels_flash_attn3   cuda_attn_L128_bfloat16     0.95  True
+hf_kernels_flash_attn3   cuda_attn_L256_bfloat16     0.98  True
+hf_kernels_flash_attn3   cuda_attn_L320_bfloat16     1.03  True
+hf_kernels_flash_attn3   cuda_attn_L384_bfloat16     1.04  True
+hf_kernels_flash_attn3   cuda_attn_L448_bfloat16     1.21  True
+hf_kernels_flash_attn3   cuda_attn_L512_bfloat16     1.18  True
 sage_int8_fp16           cuda_attn_L128_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L256_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L320_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L384_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L448_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L512_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_c9b3a60c7e3c5091&#x27; has no attribute &#x27;fwd&#x27;
-torch_flash_ma           cuda_attn_L128_bfloat16     4.09  True
-torch_flash_ma           cuda_attn_L256_bfloat16     4.79  True
-torch_flash_ma           cuda_attn_L320_bfloat16     4.90  True
-torch_flash_ma           cuda_attn_L384_bfloat16     4.98  True
-torch_flash_ma           cuda_attn_L448_bfloat16     5.05  True
-torch_flash_ma           cuda_attn_L512_bfloat16     5.47  True
-torch_mem_eff            cuda_attn_L128_bfloat16     6.77  True
-torch_mem_eff            cuda_attn_L256_bfloat16     7.24  True
-torch_mem_eff            cuda_attn_L320_bfloat16     7.52  True
-torch_mem_eff            cuda_attn_L384_bfloat16     7.59  True
-torch_mem_eff            cuda_attn_L448_bfloat16     7.97  True
-torch_mem_eff            cuda_attn_L512_bfloat16     8.47  True
-xformers_meff            cuda_attn_L128_bfloat16     3.60  True
-xformers_meff            cuda_attn_L256_bfloat16     3.43  True
-xformers_meff            cuda_attn_L320_bfloat16     4.10  True
-xformers_meff            cuda_attn_L384_bfloat16     4.01  True
-xformers_meff            cuda_attn_L448_bfloat16     4.21  True
-xformers_meff            cuda_attn_L512_bfloat16     4.43  True
+  Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
+torch_flash_ma           cuda_attn_L128_bfloat16     1.22  True
+torch_flash_ma           cuda_attn_L256_bfloat16     1.27  True
+torch_flash_ma           cuda_attn_L320_bfloat16     1.31  True
+torch_flash_ma           cuda_attn_L384_bfloat16     1.34  True
+torch_flash_ma           cuda_attn_L448_bfloat16     1.48  True
+torch_flash_ma           cuda_attn_L512_bfloat16     1.52  True
+torch_mem_eff            cuda_attn_L128_bfloat16     1.89  True
+torch_mem_eff            cuda_attn_L256_bfloat16     1.95  True
+torch_mem_eff            cuda_attn_L320_bfloat16     2.05  True
+torch_mem_eff            cuda_attn_L384_bfloat16     2.08  True
+torch_mem_eff            cuda_attn_L448_bfloat16     2.13  True
+torch_mem_eff            cuda_attn_L512_bfloat16     2.27  True
+xformers_meff            cuda_attn_L128_bfloat16     1.00  True
+xformers_meff            cuda_attn_L256_bfloat16     1.04  True
+xformers_meff            cuda_attn_L320_bfloat16     1.09  True
+xformers_meff            cuda_attn_L384_bfloat16     1.11  True
+xformers_meff            cuda_attn_L448_bfloat16     1.26  True
+xformers_meff            cuda_attn_L512_bfloat16     1.25  True
 
 GENERATING COMBINED VISUALIZATION
 
@@ -4389,53 +4402,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-   Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading networkx (1.9MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading pillow (6.7MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading numpy (15.9MiB)
-Downloading fonttools (4.7MiB)
-Downloading setuptools (1.1MiB)
-Downloading sympy (6.0MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading torch (846.8MiB)
-Downloading matplotlib (8.3MiB)
-Downloading triton (148.4MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
-      Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading matplotlib
- Downloading nvidia-cuda-cupti-cu12
- Downloading numpy
- Downloading nvidia-nvjitlink-cu12
- Downloading sympy
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
-Installed 37 packages in 230ms
+Installed 37 packages in 259ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4448,7 +4415,7 @@ Installed 37 packages in 230ms
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-24T19:27:34.267507</dc:date>
+    <dc:date>2025-10-27T14:46:38.946915</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4467,320 +4434,333 @@ Installed 37 packages in 230ms
   </g>
   <g id="axes--1" class="axes">
    <g id="patch_2">
-    <path d="M 38.27 447.507117  L 835.361742 447.507117  L 835.361742 26.88  L 38.27 26.88  L 38.27 447.507117  z " style="fill: none" />
+    <path d="M 47.81 447.507117  L 835.361742 447.507117  L 835.361742 26.88  L 47.81 26.88  L 47.81 447.507117  z " style="fill: none" />
    </g>
    <g id="matplotlib.axis_1">
     <g id="xtick_1">
      <g id="grid-x--1" class="grid grid-x">
-      <path d="M 74.501443 447.507117  L 74.501443 26.88  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 83.607806 447.507117  L 83.607806 26.88  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_1">
       <defs>
        <path id="mafb3703e5b" d="M 0 0  L 0 3.5  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#mafb3703e5b" x="74.501443" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#mafb3703e5b" x="83.607806" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_1">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(31.87119 548.84621) rotate(-45)">cuda_attn_L128_bfloat16</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(40.977554 548.84621) rotate(-45)">cuda_attn_L128_bfloat16</text>
      </g>
     </g>
     <g id="xtick_2">
      <g id="grid-x--2" class="grid grid-x">
-      <path d="M 219.427214 447.507117  L 219.427214 26.88  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 226.799032 447.507117  L 226.799032 26.88  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_2">
       <g>
-       <use ns4:href="#mafb3703e5b" x="219.427214" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#mafb3703e5b" x="226.799032" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_2">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(176.796962 548.84621) rotate(-45)">cuda_attn_L256_bfloat16</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(184.16878 548.84621) rotate(-45)">cuda_attn_L256_bfloat16</text>
      </g>
     </g>
     <g id="xtick_3">
      <g id="grid-x--3" class="grid grid-x">
-      <path d="M 364.352985 447.507117  L 364.352985 26.88  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 369.990258 447.507117  L 369.990258 26.88  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_3">
       <g>
-       <use ns4:href="#mafb3703e5b" x="364.352985" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#mafb3703e5b" x="369.990258" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_3">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(321.722733 548.84621) rotate(-45)">cuda_attn_L320_bfloat16</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(327.360005 548.84621) rotate(-45)">cuda_attn_L320_bfloat16</text>
      </g>
     </g>
     <g id="xtick_4">
      <g id="grid-x--4" class="grid grid-x">
-      <path d="M 509.278756 447.507117  L 509.278756 26.88  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 513.181484 447.507117  L 513.181484 26.88  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_4">
       <g>
-       <use ns4:href="#mafb3703e5b" x="509.278756" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#mafb3703e5b" x="513.181484" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_4">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(466.648504 548.84621) rotate(-45)">cuda_attn_L384_bfloat16</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(470.551231 548.84621) rotate(-45)">cuda_attn_L384_bfloat16</text>
      </g>
     </g>
     <g id="xtick_5">
      <g id="grid-x--5" class="grid grid-x">
-      <path d="M 654.204528 447.507117  L 654.204528 26.88  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 656.37271 447.507117  L 656.37271 26.88  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_5">
       <g>
-       <use ns4:href="#mafb3703e5b" x="654.204528" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#mafb3703e5b" x="656.37271" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_5">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(611.574275 548.84621) rotate(-45)">cuda_attn_L448_bfloat16</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(613.742457 548.84621) rotate(-45)">cuda_attn_L448_bfloat16</text>
      </g>
     </g>
     <g id="xtick_6">
      <g id="grid-x--6" class="grid grid-x">
-      <path d="M 799.130299 447.507117  L 799.130299 26.88  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 799.563935 447.507117  L 799.563935 26.88  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_6">
       <g>
-       <use ns4:href="#mafb3703e5b" x="799.130299" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#mafb3703e5b" x="799.563935" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_6">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(756.500046 548.84621) rotate(-45)">cuda_attn_L512_bfloat16</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(756.933683 548.84621) rotate(-45)">cuda_attn_L512_bfloat16</text>
      </g>
     </g>
     <g id="label--x" class="xlabel">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="436.815871" y="562.111872" transform="rotate(-0 436.815871 562.111872)">Workload</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="441.585871" y="562.111872" transform="rotate(-0 441.585871 562.111872)">Workload</text>
     </g>
    </g>
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 38.27 416.200827  L 835.361742 416.200827  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 413.210177  L 835.361742 413.210177  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_7">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="38.27" y="416.200827" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="413.210177" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_7">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="31.27" y="420.000045" transform="rotate(-0 31.27 420.000045)">3</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="417.009396" transform="rotate(-0 40.81 417.009396)">1.0</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 38.27 348.555726  L 835.361742 348.555726  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 355.233116  L 835.361742 355.233116  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_8">
       <g>
-       <use ns4:href="#m0fca2865ba" x="38.27" y="348.555726" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="355.233116" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_8">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="31.27" y="352.354944" transform="rotate(-0 31.27 352.354944)">4</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="359.032335" transform="rotate(-0 40.81 359.032335)">1.2</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 38.27 280.910625  L 835.361742 280.910625  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 297.256055  L 835.361742 297.256055  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_9">
       <g>
-       <use ns4:href="#m0fca2865ba" x="38.27" y="280.910625" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="297.256055" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_9">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="31.27" y="284.709843" transform="rotate(-0 31.27 284.709843)">5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="301.055273" transform="rotate(-0 40.81 301.055273)">1.4</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 38.27 213.265524  L 835.361742 213.265524  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 239.278993  L 835.361742 239.278993  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_10">
       <g>
-       <use ns4:href="#m0fca2865ba" x="38.27" y="213.265524" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="239.278993" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_10">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="31.27" y="217.064743" transform="rotate(-0 31.27 217.064743)">6</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="243.078212" transform="rotate(-0 40.81 243.078212)">1.6</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 38.27 145.620423  L 835.361742 145.620423  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 181.301932  L 835.361742 181.301932  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_11">
       <g>
-       <use ns4:href="#m0fca2865ba" x="38.27" y="145.620423" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="181.301932" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_11">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="31.27" y="149.419642" transform="rotate(-0 31.27 149.419642)">7</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="185.101151" transform="rotate(-0 40.81 185.101151)">1.8</text>
      </g>
     </g>
     <g id="ytick_6">
      <g id="grid-y--7" class="grid grid-y">
-      <path d="M 38.27 77.975322  L 835.361742 77.975322  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 123.324871  L 835.361742 123.324871  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_12">
       <g>
-       <use ns4:href="#m0fca2865ba" x="38.27" y="77.975322" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="123.324871" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_12">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="31.27" y="81.774541" transform="rotate(-0 31.27 81.774541)">8</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="127.124089" transform="rotate(-0 40.81 127.124089)">2.0</text>
+     </g>
+    </g>
+    <g id="ytick_7">
+     <g id="grid-y--8" class="grid grid-y">
+      <path d="M 47.81 65.347809  L 835.361742 65.347809  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_13">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.81" y="65.347809" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_13">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="69.147028" transform="rotate(-0 40.81 69.147028)">2.2</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.827813" y="237.193558" transform="rotate(-90 18.827813 237.193558)">Latency P50 (ms)</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.827187" y="237.193558" transform="rotate(-90 18.827187 237.193558)">Latency P50 (ms)</text>
     </g>
    </g>
    <g id="series--torch-flash-ma" class="series">
-    <path d="M 74.501443 342.178283  L 219.427214 295.140246  L 364.352985 287.657208  L 509.278756 281.974208  L 654.204528 277.480409  L 799.130299 249.123044  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 349.439178  L 226.799032 333.602454  L 369.990258 324.473676  L 513.181484 316.069901  L 656.37271 272.899601  L 799.563935 261.559288  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
-    <g clip-path="url(#p0d2e0c97d5)">
-     <use ns4:href="#md7efaf3aec" x="74.501443" y="342.178283" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="219.427214" y="295.140246" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="364.352985" y="287.657208" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="509.278756" y="281.974208" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="654.204528" y="277.480409" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="799.130299" y="249.123044" style="fill: #1f77b4; stroke: #1f77b4" />
+    <g clip-path="url(#p09feef2583)">
+     <use ns4:href="#md7efaf3aec" x="83.607806" y="349.439178" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="226.799032" y="333.602454" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="369.990258" y="324.473676" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="513.181484" y="316.069901" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="656.37271" y="272.899601" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="799.563935" y="261.559288" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-mem-eff" class="series">
-    <path d="M 74.501443 161.453774  L 219.427214 129.630134  L 364.352985 110.285937  L 509.278756 105.945081  L 654.204528 80.334108  L 799.130299 45.999414  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 156.020744  L 226.799032 138.969401  L 369.990258 109.128607  L 513.181484 99.249026  L 656.37271 87.05645  L 799.563935 45.999414  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
-    <g clip-path="url(#p0d2e0c97d5)">
-     <use ns4:href="#m9b8c54d372" x="74.501443" y="161.453774" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="219.427214" y="129.630134" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="364.352985" y="110.285937" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="509.278756" y="105.945081" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="654.204528" y="80.334108" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="799.130299" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
+    <g clip-path="url(#p09feef2583)">
+     <use ns4:href="#m9b8c54d372" x="83.607806" y="156.020744" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="226.799032" y="138.969401" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="369.990258" y="109.128607" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="513.181484" y="99.249026" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="656.37271" y="87.05645" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="series--xformers-meff" class="series">
-    <path d="M 74.501443 375.900177  L 219.427214 387.059046  L 364.352985 342.097108  L 509.278756 347.750344  L 654.204528 334.407754  L 799.130299 319.565404  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 414.345368  L 226.799032 400.181572  L 369.990258 385.808769  L 513.181484 380.581847  L 656.37271 338.122056  L 799.563935 339.866876  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="mc655281e0b" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #2ca02c" />
     </defs>
-    <g clip-path="url(#p0d2e0c97d5)">
-     <use ns4:href="#mc655281e0b" x="74.501443" y="375.900177" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="219.427214" y="387.059046" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="364.352985" y="342.097108" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="509.278756" y="347.750344" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="654.204528" y="334.407754" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="799.130299" y="319.565404" style="fill: #2ca02c; stroke: #2ca02c" />
+    <g clip-path="url(#p09feef2583)">
+     <use ns4:href="#mc655281e0b" x="83.607806" y="414.345368" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="226.799032" y="400.181572" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="369.990258" y="385.808769" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="513.181484" y="380.581847" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="656.37271" y="338.122056" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="799.563935" y="339.866876" style="fill: #2ca02c; stroke: #2ca02c" />
     </g>
    </g>
    <g id="series--hf-kernels-flash-attn" class="series">
-    <path d="M 74.501443 428.387702  L 219.427214 354.446599  L 364.352985 340.491147  L 509.278756 339.909398  L 654.204528 340.987724  L 799.130299 309.967712  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 420.20395  L 226.799032 407.432473  L 369.990258 399.40236  L 513.181484 392.590345  L 656.37271 345.709514  L 799.563935 346.355668  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m61c8040d7e" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #d62728" />
     </defs>
-    <g clip-path="url(#p0d2e0c97d5)">
-     <use ns4:href="#m61c8040d7e" x="74.501443" y="428.387702" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="219.427214" y="354.446599" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="364.352985" y="340.491147" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="509.278756" y="339.909398" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="654.204528" y="340.987724" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="799.130299" y="309.967712" style="fill: #d62728; stroke: #d62728" />
+    <g clip-path="url(#p09feef2583)">
+     <use ns4:href="#m61c8040d7e" x="83.607806" y="420.20395" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="226.799032" y="407.432473" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="369.990258" y="399.40236" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="513.181484" y="392.590345" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="656.37271" y="345.709514" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="799.563935" y="346.355668" style="fill: #d62728; stroke: #d62728" />
     </g>
    </g>
    <g id="series--hf-kernels-flash-attn3" class="series">
-    <path d="M 74.501443 401.329117  L 219.427214 364.396182  L 364.352985 354.430502  L 509.278756 350.637501  L 654.204528 335.557722  L 799.130299 321.02931  " clip-path="url(#p0d2e0c97d5)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 428.387702  L 226.799032 420.061906  L 369.990258 405.625328  L 513.181484 401.010644  L 656.37271 352.807645  L 799.563935 359.622849  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m7cd35be9cc" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #9467bd" />
     </defs>
-    <g clip-path="url(#p0d2e0c97d5)">
-     <use ns4:href="#m7cd35be9cc" x="74.501443" y="401.329117" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="219.427214" y="364.396182" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="364.352985" y="354.430502" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="509.278756" y="350.637501" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="654.204528" y="335.557722" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="799.130299" y="321.02931" style="fill: #9467bd; stroke: #9467bd" />
+    <g clip-path="url(#p09feef2583)">
+     <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="226.799032" y="420.061906" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="369.990258" y="405.625328" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="513.181484" y="401.010644" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="656.37271" y="352.807645" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="799.563935" y="359.622849" style="fill: #9467bd; stroke: #9467bd" />
     </g>
    </g>
    <g id="patch_3">
-    <path d="M 38.27 447.507117  L 38.27 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+    <path d="M 47.81 447.507117  L 47.81 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
    </g>
    <g id="patch_4">
     <path d="M 835.361742 447.507117  L 835.361742 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
    </g>
    <g id="patch_5">
-    <path d="M 38.27 447.507117  L 835.361742 447.507117  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+    <path d="M 47.81 447.507117  L 835.361742 447.507117  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
    </g>
    <g id="patch_6">
-    <path d="M 38.27 26.88  L 835.361742 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+    <path d="M 47.81 26.88  L 835.361742 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
    </g>
-   <g id="text_13">
-    <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="436.815871" y="20.88" transform="rotate(-0 436.815871 20.88)">Attention Implementation Latency</text>
+   <g id="text_14">
+    <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="441.585871" y="20.88" transform="rotate(-0 441.585871 20.88)">Attention Implementation Latency</text>
    </g>
    <g id="legend" class="legend">
     <g id="patch_7">
-     <path d="M 45.27 109.66125  L 188.765313 109.66125  Q 190.765313 109.66125 190.765313 107.66125  L 190.765313 33.88  Q 190.765313 31.88 188.765313 31.88  L 45.27 31.88  Q 43.27 31.88 43.27 33.88  L 43.27 107.66125  Q 43.27 109.66125 45.27 109.66125  L 45.27 109.66125  z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
+     <path d="M 54.81 109.66125  L 198.305313 109.66125  Q 200.305313 109.66125 200.305313 107.66125  L 200.305313 33.88  Q 200.305313 31.88 198.305313 31.88  L 54.81 31.88  Q 52.81 31.88 52.81 33.88  L 52.81 107.66125  Q 52.81 109.66125 54.81 109.66125  L 54.81 109.66125  z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
     </g>
-    <g id="line2d_13">
-     <path d="M 47.27 39.978438  L 57.27 39.978438  L 67.27 39.978438  " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <g id="line2d_14">
+     <path d="M 56.81 39.978438  L 66.81 39.978438  L 76.81 39.978438  " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#md7efaf3aec" x="57.27" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
+      <use ns4:href="#md7efaf3aec" x="66.81" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
      </g>
     </g>
     <g id="legend-label--torch-flash-ma" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="75.27" y="43.478438" transform="rotate(-0 75.27 43.478438)">torch_flash_ma</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="43.478438" transform="rotate(-0 84.81 43.478438)">torch_flash_ma</text>
     </g>
-    <g id="line2d_14">
-     <path d="M 47.27 54.934687  L 57.27 54.934687  L 67.27 54.934687  " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <g id="line2d_15">
+     <path d="M 56.81 54.934687  L 66.81 54.934687  L 76.81 54.934687  " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#m9b8c54d372" x="57.27" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
+      <use ns4:href="#m9b8c54d372" x="66.81" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
      </g>
     </g>
     <g id="legend-label--torch-mem-eff" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="75.27" y="58.434687" transform="rotate(-0 75.27 58.434687)">torch_mem_eff</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="58.434687" transform="rotate(-0 84.81 58.434687)">torch_mem_eff</text>
     </g>
-    <g id="line2d_15">
-     <path d="M 47.27 69.890938  L 57.27 69.890938  L 67.27 69.890938  " style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
+    <g id="line2d_16">
+     <path d="M 56.81 69.890938  L 66.81 69.890938  L 76.81 69.890938  " style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#mc655281e0b" x="57.27" y="69.890938" style="fill: #2ca02c; stroke: #2ca02c" />
+      <use ns4:href="#mc655281e0b" x="66.81" y="69.890938" style="fill: #2ca02c; stroke: #2ca02c" />
      </g>
     </g>
     <g id="legend-label--xformers-meff" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="75.27" y="73.390938" transform="rotate(-0 75.27 73.390938)">xformers_meff</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="73.390938" transform="rotate(-0 84.81 73.390938)">xformers_meff</text>
     </g>
-    <g id="line2d_16">
-     <path d="M 47.27 84.847188  L 57.27 84.847188  L 67.27 84.847188  " style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
+    <g id="line2d_17">
+     <path d="M 56.81 84.847188  L 66.81 84.847188  L 76.81 84.847188  " style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#m61c8040d7e" x="57.27" y="84.847188" style="fill: #d62728; stroke: #d62728" />
+      <use ns4:href="#m61c8040d7e" x="66.81" y="84.847188" style="fill: #d62728; stroke: #d62728" />
      </g>
     </g>
     <g id="legend-label--hf-kernels-flash-attn" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="75.27" y="88.347188" transform="rotate(-0 75.27 88.347188)">hf_kernels_flash_attn</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="88.347188" transform="rotate(-0 84.81 88.347188)">hf_kernels_flash_attn</text>
     </g>
-    <g id="line2d_17">
-     <path d="M 47.27 99.803438  L 57.27 99.803438  L 67.27 99.803438  " style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
+    <g id="line2d_18">
+     <path d="M 56.81 99.803438  L 66.81 99.803438  L 76.81 99.803438  " style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#m7cd35be9cc" x="57.27" y="99.803438" style="fill: #9467bd; stroke: #9467bd" />
+      <use ns4:href="#m7cd35be9cc" x="66.81" y="99.803438" style="fill: #9467bd; stroke: #9467bd" />
      </g>
     </g>
     <g id="legend-label--hf-kernels-flash-attn3" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="75.27" y="103.303438" transform="rotate(-0 75.27 103.303438)">hf_kernels_flash_attn3</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="103.303438" transform="rotate(-0 84.81 103.303438)">hf_kernels_flash_attn3</text>
     </g>
    </g>
   </g>
  </g>
  <defs>
-  <clipPath id="p0d2e0c97d5">
-   <rect x="38.27" y="26.88" width="797.091742" height="420.627117" />
+  <clipPath id="p09feef2583">
+   <rect x="47.81" y="26.88" width="787.551742" height="420.627117" />
   </clipPath>
  </defs>
 </svg>
diff --git a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
index c6e99c7cf12fb0468248ecd58ebe47ba3c3385d9..a79e8f8bdbd98b4943b0ef6a74d24d280882e16b 100644
--- a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
+++ b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
@@ -1,48 +1,48 @@
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D1024", "batch": 1, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.028959999326616526, "p50": 0.029670016374439, "p90": 0.02976099494844675, "mean": 0.030270603019744158, "iqr": 0.00046996865421533585, "raw_times": [0.028959999326616526, 0.02976099494844675, 0.029291026294231415, 0.029670016374439, 0.0336709781549871], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03676099004223943, "peak_bytes": 2363392, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00151824951171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D2048", "batch": 1, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.0324210268445313, "p50": 0.03255100455135107, "p90": 0.03347999881953001, "mean": 0.0366490101441741, "iqr": 0.0010589719749987125, "raw_times": [0.0324210268445313, 0.05237199366092682, 0.03347999881953001, 0.03255100455135107, 0.0324210268445313], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03719097003340721, "peak_bytes": 4726784, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0014801025390625, "mse": 1.0728836059570312e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D4096", "batch": 1, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.030800001695752144, "p50": 0.032610027119517326, "p90": 0.03269099397584796, "mean": 0.03422859590500593, "iqr": 0.0001300359144806862, "raw_times": [0.042480998672544956, 0.03256095806136727, 0.032610027119517326, 0.03269099397584796, 0.030800001695752144], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.038141035474836826, "peak_bytes": 9453568, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00153350830078125, "mse": 1.0967254638671875e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D8192", "batch": 1, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.031030969694256783, "p50": 0.03305997233837843, "p90": 0.03334099892526865, "mean": 0.034240796230733395, "iqr": 0.0011699739843606949, "raw_times": [0.031030969694256783, 0.03334099892526865, 0.03305997233837843, 0.032171024940907955, 0.041601015254855156], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03404001472517848, "peak_bytes": 18907136, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00148773193359375, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D1024", "batch": 1, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.030020950362086296, "p50": 0.03277999348938465, "p90": 0.03290100721642375, "mean": 0.03224459942430258, "iqr": 0.0006309710443019867, "raw_times": [0.03227003617212176, 0.03290100721642375, 0.03277999348938465, 0.030020950362086296, 0.03325100988149643], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.034740951377898455, "peak_bytes": 9441280, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.001556396484375, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D2048", "batch": 1, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.029260001610964537, "p50": 0.031291041523218155, "p90": 0.03236101474612951, "mean": 0.03568681422621012, "iqr": 0.0016400008462369442, "raw_times": [0.029260001610964537, 0.03236101474612951, 0.054800999350845814, 0.03072101389989257, 0.031291041523218155], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03347097663208842, "peak_bytes": 18882560, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D4096", "batch": 1, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.029839982744306326, "p50": 0.031700998079031706, "p90": 0.03264000406488776, "mean": 0.031820591539144516, "iqr": 0.0009690411388874054, "raw_times": [0.029839982744306326, 0.03167096292600036, 0.03264000406488776, 0.031700998079031706, 0.03325100988149643], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03392098005861044, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D8192", "batch": 1, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.03726099384948611, "p50": 0.03943097544834018, "p90": 0.040319981053471565, "mean": 0.039948790799826384, "iqr": 0.0009189825505018234, "raw_times": [0.03726099384948611, 0.03940099850296974, 0.04333100514486432, 0.040319981053471565, 0.03943097544834018], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.040440005250275135, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0014801025390625, "mse": 1.0848045349121094e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D1024", "batch": 1, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.029730028472840786, "p50": 0.03197102341800928, "p90": 0.03233103780075908, "mean": 0.03148262621834874, "iqr": 0.001300009898841381, "raw_times": [0.029730028472840786, 0.03197102341800928, 0.03235001349821687, 0.031031027901917696, 0.03233103780075908], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.034419994335621595, "peak_bytes": 21008384, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1324882507324219e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D2048", "batch": 1, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.030681025236845016, "p50": 0.031400995794683695, "p90": 0.03258103970438242, "mean": 0.03199680941179395, "iqr": 0.0015910482034087181, "raw_times": [0.030681025236845016, 0.03258103970438242, 0.034330994822084904, 0.0309899915009737, 0.031400995794683695], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03291096072643995, "peak_bytes": 37756928, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D4096", "batch": 1, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.03773096250370145, "p50": 0.03855105023831129, "p90": 0.03870099317282438, "mean": 0.038839003536850214, "iqr": 0.00018998980522155762, "raw_times": [0.03773096250370145, 0.04070100840181112, 0.03855105023831129, 0.038511003367602825, 0.03870099317282438], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04059000639244914, "peak_bytes": 75513856, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D8192", "batch": 1, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05229096859693527, "p50": 0.05274196155369282, "p90": 0.052841962315142155, "mean": 0.05267937667667866, "iqr": 0.00024097971618175507, "raw_times": [0.05274196155369282, 0.052841962315142155, 0.05229096859693527, 0.05292100831866264, 0.0526009825989604], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05453097401186824, "peak_bytes": 151027712, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.001495361328125, "mse": 1.0967254638671875e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D1024", "batch": 1, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.033741001971066, "p50": 0.03515096614137292, "p90": 0.035751028917729855, "mean": 0.035592797212302685, "iqr": 0.0010509975254535675, "raw_times": [0.033741001971066, 0.03515096614137292, 0.038620957639068365, 0.035751028917729855, 0.03470003139227629], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03478099824860692, "peak_bytes": 41979904, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D2048", "batch": 1, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.040161015931516886, "p50": 0.04095997428521514, "p90": 0.04124100087210536, "mean": 0.04105480620637536, "iqr": 0.00031996751204133034, "raw_times": [0.040161015931516886, 0.04199100658297539, 0.04124100087210536, 0.04095997428521514, 0.04092103336006403], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04168099258095026, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D4096", "batch": 1, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05262100603431463, "p50": 0.05288200918585062, "p90": 0.0531109981238842, "mean": 0.05337720504030585, "iqr": 0.00027997884899377823, "raw_times": [0.05283101927489042, 0.05262100603431463, 0.05288200918585062, 0.0531109981238842, 0.05544099258258939], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05422096000984311, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D8192", "batch": 1, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.2716059680096805, "p50": 0.2742449869401753, "p90": 0.2774460008367896, "mean": 0.2755257999524474, "iqr": 0.0037999707274138927, "raw_times": [0.28068601386621594, 0.2736460301093757, 0.2742449869401753, 0.2716059680096805, 0.2774460008367896], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.27578597655519843, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D1024", "batch": 4, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.030030030757188797, "p50": 0.031871022656559944, "p90": 0.03221101360395551, "mean": 0.03341861302033067, "iqr": 0.00040099257603287697, "raw_times": [0.030030030757188797, 0.03221101360395551, 0.04117097705602646, 0.031871022656559944, 0.03181002102792263], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.034871045500040054, "peak_bytes": 69242880, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.001556396484375, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D2048", "batch": 4, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.02923101419582963, "p50": 0.030590977985411882, "p90": 0.030929979402571917, "mean": 0.030384794808924198, "iqr": 0.0010989606380462646, "raw_times": [0.02923101419582963, 0.03134098369628191, 0.030590977985411882, 0.029831018764525652, 0.030929979402571917], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03408099291846156, "peak_bytes": 18882560, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D4096", "batch": 4, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.030279974453151226, "p50": 0.03076100256294012, "p90": 0.03130995901301503, "mean": 0.031078385654836893, "iqr": 0.0005599576979875565, "raw_times": [0.03130995901301503, 0.03076100256294012, 0.030279974453151226, 0.03229099093005061, 0.030750001315027475], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03285001730546355, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D8192", "batch": 4, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.03846001345664263, "p50": 0.03899098373949528, "p90": 0.039101054426282644, "mean": 0.039028620813041925, "iqr": 0.00023102620616555214, "raw_times": [0.03899098373949528, 0.039101054426282644, 0.039721024222671986, 0.03887002822011709, 0.03846001345664263], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.040511018596589565, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0014801025390625, "mse": 1.0848045349121094e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D1024", "batch": 4, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.0330110196955502, "p50": 0.034221040550619364, "p90": 0.03488000947982073, "mean": 0.03425482427701354, "iqr": 0.0009589712135493755, "raw_times": [0.0330110196955502, 0.034221040550619364, 0.03524101339280605, 0.03392103826627135, 0.03488000947982073], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.036880956031382084, "peak_bytes": 37752832, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D2048", "batch": 4, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.039421021938323975, "p50": 0.039631035178899765, "p90": 0.041121034882962704, "mean": 0.04027721006423235, "iqr": 0.0015800469554960728, "raw_times": [0.039631035178899765, 0.04167197039350867, 0.041121034882962704, 0.03954098792746663, 0.039421021938323975], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04271004581823945, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D4096", "batch": 4, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.052101968321949244, "p50": 0.05301198689267039, "p90": 0.053400988690555096, "mean": 0.052991590928286314, "iqr": 0.0005399924702942371, "raw_times": [0.052101968321949244, 0.05358201451599598, 0.05286099622026086, 0.05301198689267039, 0.053400988690555096], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054280972108244896, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D8192", "batch": 4, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.2699450124055147, "p50": 0.2703659702092409, "p90": 0.2711050328798592, "mean": 0.27191760018467903, "iqr": 0.0009190407581627369, "raw_times": [0.2703659702092409, 0.2711050328798592, 0.27798599330708385, 0.27018599212169647, 0.2699450124055147], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.27769600274041295, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D1024", "batch": 4, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.04519004141911864, "p50": 0.045621010940521955, "p90": 0.04610104952007532, "mean": 0.046770821791142225, "iqr": 0.0005300389602780342, "raw_times": [0.04557101055979729, 0.04610104952007532, 0.045621010940521955, 0.04519004141911864, 0.05137099651619792], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0482110190205276, "peak_bytes": 83922944, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D2048", "batch": 4, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05651102401316166, "p50": 0.05743099609389901, "p90": 0.05767098627984524, "mean": 0.05834901239722967, "iqr": 0.00033993273973464966, "raw_times": [0.05651102401316166, 0.06280100205913186, 0.05743099609389901, 0.05767098627984524, 0.05733105354011059], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05965103628113866, "peak_bytes": 151003136, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D4096", "batch": 4, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.2797759952954948, "p50": 0.2818260109052062, "p90": 0.2828260185196996, "mean": 0.28369000647217035, "iqr": 0.0011200318112969398, "raw_times": [0.2797759952954948, 0.28170598670840263, 0.2828260185196996, 0.2818260109052062, 0.29231602093204856], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.278854975476861, "peak_bytes": 302006272, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D8192", "batch": 4, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5820420337840915, "p50": 0.589212984777987, "p90": 0.5898119998164475, "mean": 0.5889245891012251, "iqr": 0.002659042365849018, "raw_times": [0.5898119998164475, 0.5871529574505985, 0.589212984777987, 0.5820420337840915, 0.5964029696770012], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5985530442558229, "peak_bytes": 604012544, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D1024", "batch": 4, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.06898096762597561, "p50": 0.07057201582938433, "p90": 0.07107201963663101, "mean": 0.07093560416251421, "iqr": 0.0007100170478224754, "raw_times": [0.07369101513177156, 0.07036200258880854, 0.07107201963663101, 0.06898096762597561, 0.07057201582938433], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06987096276134253, "peak_bytes": 167809024, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D2048", "batch": 4, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.2780960057862103, "p50": 0.2848859876394272, "p90": 0.28691597981378436, "mean": 0.2848202013410628, "iqr": 0.003049965016543865, "raw_times": [0.2838660147972405, 0.28691597981378436, 0.2848859876394272, 0.2903370186686516, 0.2780960057862103], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.27952599339187145, "peak_bytes": 301998080, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015716552734375, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D4096", "batch": 4, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5754730082117021, "p50": 0.5831019952893257, "p90": 0.5851630121469498, "mean": 0.582532596308738, "iqr": 0.004841014742851257, "raw_times": [0.5831019952893257, 0.5851630121469498, 0.5803219974040985, 0.5886029684916139, 0.5754730082117021], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5842720274813473, "peak_bytes": 603996160, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D8192", "batch": 4, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 1.1429850128479302, "p50": 1.1474639759398997, "p90": 1.1484349961392581, "mean": 1.1493865866214037, "iqr": 0.001491047441959381, "raw_times": [1.1474639759398997, 1.1429850128479302, 1.1484349961392581, 1.1469439486972988, 1.1611049994826317], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1680549941956997, "peak_bytes": 1207992320, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D1024", "batch": 16, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.03370095510035753, "p50": 0.03467098576948047, "p90": 0.03499101148918271, "mean": 0.034558994229882956, "iqr": 0.0006700283847749233, "raw_times": [0.03432098310440779, 0.03467098576948047, 0.03499101148918271, 0.03511103568598628, 0.03370095510035753], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.035281002055853605, "peak_bytes": 276860928, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D2048", "batch": 16, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.04038098268210888, "p50": 0.04121096571907401, "p90": 0.041480991058051586, "mean": 0.04112699534744024, "iqr": 0.0006599584594368935, "raw_times": [0.04038098268210888, 0.04121096571907401, 0.04082103259861469, 0.041741004679352045, 0.041480991058051586], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04577101208269596, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D4096", "batch": 16, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.0530310207977891, "p50": 0.05350098945200443, "p90": 0.05359097849577665, "mean": 0.053398997988551855, "iqr": 0.0005399924702942371, "raw_times": [0.05350098945200443, 0.053050986025482416, 0.0530310207977891, 0.05359097849577665, 0.053821015171706676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.062031031120568514, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D8192", "batch": 16, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.26927603175863624, "p50": 0.27200503973290324, "p90": 0.27223501820117235, "mean": 0.2713776077143848, "iqr": 0.002459040842950344, "raw_times": [0.27200503973290324, 0.27359597152099013, 0.26927603175863624, 0.27223501820117235, 0.269775977358222], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2747260150499642, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D1024", "batch": 16, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.06719096563756466, "p50": 0.07005198858678341, "p90": 0.07023202488198876, "mean": 0.06946560461074114, "iqr": 0.0011699739843606949, "raw_times": [0.06719096563756466, 0.07005198858678341, 0.06906205089762807, 0.07079099304974079, 0.07023202488198876], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07167202420532703, "peak_bytes": 150999040, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D2048", "batch": 16, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.2804459654726088, "p50": 0.28543599182739854, "p90": 0.28567598201334476, "mean": 0.2847379771992564, "iqr": 0.0013799872249364853, "raw_times": [0.28543599182739854, 0.2878359518945217, 0.2842959947884083, 0.28567598201334476, 0.2804459654726088], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.28216600185260177, "peak_bytes": 301998080, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015716552734375, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D4096", "batch": 16, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5787130212411284, "p50": 0.5883330013602972, "p90": 0.5898720119148493, "mean": 0.5873608053661883, "iqr": 0.0030189985409379005, "raw_times": [0.5930329789407551, 0.5868530133739114, 0.5898720119148493, 0.5787130212411284, 0.5883330013602972], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5958119872957468, "peak_bytes": 603996160, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D8192", "batch": 16, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 1.1475840001367033, "p50": 1.149774994701147, "p90": 1.149774994701147, "mean": 1.149676798377186, "iqr": 0.0017299898900091648, "raw_times": [1.148045004811138, 1.149774994701147, 1.149774994701147, 1.153204997535795, 1.1475840001367033], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1586649925448, "peak_bytes": 1207992320, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D1024", "batch": 16, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.28807600028812885, "p50": 0.29028597055003047, "p90": 0.2923660213127732, "mean": 0.290472200140357, "iqr": 0.0033390242606401443, "raw_times": [0.2923660213127732, 0.29028597055003047, 0.2890269970521331, 0.28807600028812885, 0.29260601149871945], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.28624600963667035, "peak_bytes": 335581184, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D2048", "batch": 16, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5808720015920699, "p50": 0.5865029525011778, "p90": 0.5889830063097179, "mean": 0.5862265941686928, "iqr": 0.0050209928303956985, "raw_times": [0.5808720015920699, 0.5865029525011778, 0.5908129969611764, 0.5889830063097179, 0.5839620134793222], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5870030145160854, "peak_bytes": 603987968, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D4096", "batch": 16, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 1.1516850208863616, "p50": 1.1541039566509426, "p90": 1.159774954430759, "mean": 1.1568425805307925, "iqr": 0.006380956619977951, "raw_times": [1.1652549728751183, 1.153393997810781, 1.1541039566509426, 1.159774954430759, 1.1516850208863616], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1652849498204887, "peak_bytes": 1207975936, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D8192", "batch": 16, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 2.264778013341129, "p50": 2.2672080085612833, "p90": 2.2703579743392766, "mean": 2.2687464021146297, "iqr": 0.0047089415602386, "raw_times": [2.2703579743392766, 2.264778013341129, 2.2672080085612833, 2.265649032779038, 2.275738981552422], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.299049054272473, "peak_bytes": 2415951872, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00150299072265625, "mse": 1.0967254638671875e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D1024", "batch": 16, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5862729740329087, "p50": 0.5921830306760967, "p90": 0.5925330333411694, "mean": 0.591674807947129, "iqr": 0.00036100391298532486, "raw_times": [0.592172029428184, 0.5921830306760967, 0.5925330333411694, 0.5952129722572863, 0.5862729740329087], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5872620386071503, "peak_bytes": 671125504, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D2048", "batch": 16, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 1.150664989836514, "p50": 1.156534010078758, "p90": 1.1567150359041989, "mean": 1.15486680297181, "iqr": 0.004480069037526846, "raw_times": [1.1567150359041989, 1.150664989836514, 1.1581850121729076, 1.156534010078758, 1.152234966866672], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.150145020801574, "peak_bytes": 1207967744, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.00156402587890625, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 2.2770979558117688, "p50": 2.2999990032985806, "p90": 2.302108972799033, "mean": 2.2958547924645245, "iqr": 0.012759934179484844, "raw_times": [2.2770979558117688, 2.302108972799033, 2.310718991793692, 2.2999990032985806, 2.289349038619548], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.3001389927230775, "peak_bytes": 2415935488, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null}
-{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 4.500485956668854, "p50": 4.510977014433593, "p90": 4.513906955253333, "mean": 4.509088769555092, "iqr": 0.010930001735687256, "raw_times": [4.500485956668854, 4.510977014433593, 4.502976953517646, 4.513906955253333, 4.5170969679020345], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.5062569552101195, "peak_bytes": 4831870976, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D1024", "batch": 1, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.03865100006805733, "p50": 0.03903099991475756, "p90": 0.04018100003122527, "mean": 0.03959079995183856, "iqr": 0.001300000121773337, "raw_times": [0.03888099990945193, 0.03903099991475756, 0.04018100003122527, 0.04120999983570073, 0.03865100006805733], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05060100011178292, "peak_bytes": 2363392, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D2048", "batch": 1, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04517999991549004, "p50": 0.04712100007964182, "p90": 0.04805000003216264, "mean": 0.04695459997492435, "iqr": 0.001779000058377278, "raw_times": [0.04517999991549004, 0.04805000003216264, 0.04712100007964182, 0.046270999973785365, 0.04815099987354188], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05504099999598111, "peak_bytes": 4726784, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1324882507324219e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D4096", "batch": 1, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04581999996844388, "p50": 0.04766099982589367, "p90": 0.04786099998455029, "mean": 0.047156599976005964, "iqr": 0.0017899999420478707, "raw_times": [0.04766099982589367, 0.04786099998455029, 0.04837000005863956, 0.04607100004250242, 0.04581999996844388], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05361099988476781, "peak_bytes": 9453568, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00146484375, "mse": 1.049041748046875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D8192", "batch": 1, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.045190999799160636, "p50": 0.04684100008489622, "p90": 0.04752099994220771, "mean": 0.046596999982284615, "iqr": 0.00227999998969608, "raw_times": [0.04524099995251163, 0.04752099994220771, 0.04819100013264688, 0.04684100008489622, 0.045190999799160636], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052801000038016355, "peak_bytes": 18907136, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0016326904296875, "mse": 1.1801719665527344e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D1024", "batch": 1, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04395000019030704, "p50": 0.045061000037094345, "p90": 0.046920999920985196, "mean": 0.04563460001918429, "iqr": 0.0018609998733154498, "raw_times": [0.04718099989986513, 0.046920999920985196, 0.045060000047669746, 0.045061000037094345, 0.04395000019030704], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05090100012239418, "peak_bytes": 9441280, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D2048", "batch": 1, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04656000010072603, "p50": 0.046920999920985196, "p90": 0.04878100003224972, "mean": 0.04884479999418545, "iqr": 0.0020300001324358163, "raw_times": [0.04656000010072603, 0.046750999899813905, 0.04878100003224972, 0.0552110000171524, 0.046920999920985196], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0497399998948822, "peak_bytes": 18882560, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D4096", "batch": 1, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04567099995256285, "p50": 0.04622100004780805, "p90": 0.04798100007974426, "mean": 0.047496800016233465, "iqr": 0.0018200000795332016, "raw_times": [0.04567099995256285, 0.0514500000008411, 0.04616100000021106, 0.04798100007974426, 0.04622100004780805], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04885000021204178, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D8192", "batch": 1, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04487000001063279, "p50": 0.045961000068928115, "p90": 0.046200000042517786, "mean": 0.04860060003011313, "iqr": 0.000509000074089272, "raw_times": [0.06028100006005843, 0.04487000001063279, 0.045690999968428514, 0.045961000068928115, 0.046200000042517786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05061100000602892, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D1024", "batch": 1, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.043511000058060745, "p50": 0.046270999973785365, "p90": 0.04790999992110301, "mean": 0.047574600012012525, "iqr": 0.002919999815276242, "raw_times": [0.044990000105826766, 0.04790999992110301, 0.043511000058060745, 0.05519100000128674, 0.046270999973785365], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.048970999841913, "peak_bytes": 21008384, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D2048", "batch": 1, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.043170000026293565, "p50": 0.04767099994751334, "p90": 0.0476899999739544, "mean": 0.04691639996963204, "iqr": 0.0009390000741404947, "raw_times": [0.043170000026293565, 0.04930000000058499, 0.04767099994751334, 0.046750999899813905, 0.0476899999739544], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05084099984742352, "peak_bytes": 37756928, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D4096", "batch": 1, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044720999994751764, "p50": 0.045860000000175205, "p90": 0.046411000084845, "mean": 0.04585680003401649, "iqr": 0.0012000000424450263, "raw_times": [0.044720999994751764, 0.04708100004791049, 0.046411000084845, 0.045860000000175205, 0.045211000042399974], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05302099998516496, "peak_bytes": 75513856, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0016021728515625, "mse": 1.1682510375976562e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D8192", "batch": 1, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04476999993130448, "p50": 0.04614999988916679, "p90": 0.04633100002138235, "mean": 0.04639259996110923, "iqr": 0.00019000003703695256, "raw_times": [0.04476999993130448, 0.04614999988916679, 0.04857099997934711, 0.0461409999843454, 0.04633100002138235], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.047730999995110324, "peak_bytes": 151027712, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D1024", "batch": 1, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04606099992088275, "p50": 0.046679999968546326, "p90": 0.04687099999500788, "mean": 0.0466285999664251, "iqr": 0.0006509999366244301, "raw_times": [0.04606099992088275, 0.047310999889305094, 0.04687099999500788, 0.046679999968546326, 0.04622000005838345], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.050389999842082034, "peak_bytes": 41979904, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D2048", "batch": 1, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04560100001071987, "p50": 0.04617999979927845, "p90": 0.04656999999497202, "mean": 0.0462445999346528, "iqr": 0.0007090000053722179, "raw_times": [0.04617999979927845, 0.045860999989599804, 0.04560100001071987, 0.04701099987869384, 0.04656999999497202], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.049061000026995316, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D4096", "batch": 1, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04474100001061743, "p50": 0.04615000011654047, "p90": 0.04696099995271652, "mean": 0.046176800060493406, "iqr": 0.0009599998520570807, "raw_times": [0.047031000121933175, 0.04474100001061743, 0.04600100010065944, 0.04615000011654047, 0.04696099995271652], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.051490000032572425, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D8192", "batch": 1, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051341000016691396, "p50": 0.05155100006959401, "p90": 0.05226099983701715, "mean": 0.051880799992432, "iqr": 0.0007709998044447275, "raw_times": [0.051341000016691396, 0.051490000032572425, 0.05226099983701715, 0.05155100006959401, 0.05276100000628503], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053531000048678834, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D1024", "batch": 4, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044059999936507666, "p50": 0.04549100003714557, "p90": 0.045540999963122886, "mean": 0.04540859999906388, "iqr": 0.0004099999841855606, "raw_times": [0.04549100003714557, 0.044059999936507666, 0.04682000007960596, 0.045130999978937325, 0.045540999963122886], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.048860999868338695, "peak_bytes": 69242880, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D2048", "batch": 4, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04604099990501709, "p50": 0.04642099997909099, "p90": 0.04698099996858218, "mean": 0.05290099998092046, "iqr": 0.0009299999419454252, "raw_times": [0.07901100002527528, 0.04698099996858218, 0.04604099990501709, 0.04605100002663676, 0.04642099997909099], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.048481000021638465, "peak_bytes": 18882560, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D4096", "batch": 4, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04422100005285756, "p50": 0.045961000068928115, "p90": 0.04607100004250242, "mean": 0.04557280003609776, "iqr": 0.0010700000530050602, "raw_times": [0.04500099998949736, 0.045961000068928115, 0.04422100005285756, 0.046610000026703347, 0.04607100004250242], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05000100009056041, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D8192", "batch": 4, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044550999973580474, "p50": 0.04615100010596507, "p90": 0.04661999992094934, "mean": 0.04619880000973353, "iqr": 0.0006089999260439072, "raw_times": [0.04661999992094934, 0.047661000053267344, 0.04615100010596507, 0.04601099999490543, 0.044550999973580474], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05021999982091074, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D1024", "batch": 4, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04479100016396842, "p50": 0.04570999999486958, "p90": 0.04578100015351083, "mean": 0.04546060008578934, "iqr": 0.0006410000423784368, "raw_times": [0.045881000005465467, 0.04578100015351083, 0.045140000111132395, 0.04479100016396842, 0.04570999999486958], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05074099999546888, "peak_bytes": 37752832, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D2048", "batch": 4, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04388000002109038, "p50": 0.046260999852165696, "p90": 0.047070999926290824, "mean": 0.046070799999142764, "iqr": 0.0010899998414970469, "raw_times": [0.04716100011137314, 0.04598100008479378, 0.04388000002109038, 0.046260999852165696, 0.047070999926290824], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05007100003240339, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D4096", "batch": 4, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04435100004229753, "p50": 0.045130999978937325, "p90": 0.04698099996858218, "mean": 0.04562479998639901, "iqr": 0.0023600000531587284, "raw_times": [0.044620999915423454, 0.04698099996858218, 0.04704000002675457, 0.04435100004229753, 0.045130999978937325], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04849099991588446, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D8192", "batch": 4, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05119000002196117, "p50": 0.05123000005369249, "p90": 0.05150099991624302, "mean": 0.051574400004028575, "iqr": 0.00027999999474559445, "raw_times": [0.05122099992149742, 0.05150099991624302, 0.05123000005369249, 0.052730000106748776, 0.05119000002196117], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05633099999613478, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D1024", "batch": 4, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04580100016937649, "p50": 0.04708999995273189, "p90": 0.04770099985762499, "mean": 0.05188039999666216, "iqr": 0.00096099984148168, "raw_times": [0.07206999998743413, 0.04674000001614331, 0.04580100016937649, 0.04708999995273189, 0.04770099985762499], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04944000011164462, "peak_bytes": 83922944, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D2048", "batch": 4, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04320099992582982, "p50": 0.04512100008469133, "p90": 0.04604099990501709, "mean": 0.04527500000222062, "iqr": 0.001329999804511317, "raw_times": [0.04320099992582982, 0.04604099990501709, 0.04471100010050577, 0.0473009999950591, 0.04512100008469133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.051181000117139774, "peak_bytes": 151003136, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D4096", "batch": 4, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04984999986845651, "p50": 0.050290999979552, "p90": 0.050490999910834944, "mean": 0.050288599959458224, "iqr": 0.0005399999736255268, "raw_times": [0.04995099993720942, 0.050490999910834944, 0.050290999979552, 0.050860000101238256, 0.04984999986845651], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052241000048525166, "peak_bytes": 302006272, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D8192", "batch": 4, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2064129998871067, "p50": 0.2123330000358692, "p90": 0.218262999851504, "mean": 0.2148927999769512, "iqr": 0.010130999726243317, "raw_times": [0.20813200012526067, 0.218262999851504, 0.2123330000358692, 0.2064129998871067, 0.22932299998501549], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21481299995684822, "peak_bytes": 604012544, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D1024", "batch": 4, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04353000008450181, "p50": 0.04543000000012398, "p90": 0.04657099998439662, "mean": 0.04557060001388891, "iqr": 0.001390000079481979, "raw_times": [0.04543000000012398, 0.04518099990491464, 0.04714100009550748, 0.04657099998439662, 0.04353000008450181], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0490809998154873, "peak_bytes": 167809024, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D2048", "batch": 4, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.054420999958892935, "p50": 0.05506100001184677, "p90": 0.055460999874412664, "mean": 0.055042999929355574, "iqr": 0.0008699998943484388, "raw_times": [0.054420999958892935, 0.055460999874412664, 0.05568099982156127, 0.054590999980064225, 0.05506100001184677], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05802099985885434, "peak_bytes": 301998080, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D4096", "batch": 4, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.20996300008846447, "p50": 0.2102230000673444, "p90": 0.21053299997220165, "mean": 0.21050080003988114, "iqr": 0.0004209998678561533, "raw_times": [0.20996300008846447, 0.21053299997220165, 0.2102230000673444, 0.2101120001043455, 0.2116729999670497], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21157300011509506, "peak_bytes": 603996160, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D8192", "batch": 4, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4341660001045966, "p50": 0.4372359999251785, "p90": 0.4383160000998032, "mean": 0.437980000015159, "iqr": 0.004120000085094944, "raw_times": [0.4383160000998032, 0.4372359999251785, 0.4341660001045966, 0.43419600001470826, 0.44598599993150856], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.44448700009525055, "peak_bytes": 1207992320, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D1024", "batch": 16, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04514099987318332, "p50": 0.0465299999632407, "p90": 0.04655099996853096, "mean": 0.04629059999388119, "iqr": 0.0011309998626529705, "raw_times": [0.04514099987318332, 0.04655099996853096, 0.04781100005857297, 0.04542000010587799, 0.0465299999632407], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04811100006918423, "peak_bytes": 276860928, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D2048", "batch": 16, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.043820999962917995, "p50": 0.045551000084742554, "p90": 0.04633000003195775, "mean": 0.04580079998959263, "iqr": 0.0007890000688348664, "raw_times": [0.043820999962917995, 0.04776099990522198, 0.045551000084742554, 0.045540999963122886, 0.04633000003195775], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05054000007476134, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D4096", "batch": 16, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04511099996307166, "p50": 0.04610100017998775, "p90": 0.04624100006367371, "mean": 0.04598500008796691, "iqr": 0.0004099999841855606, "raw_times": [0.04583100007948815, 0.04664100015361328, 0.04511099996307166, 0.04610100017998775, 0.04624100006367371], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04932000001645065, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D8192", "batch": 16, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05165100014892232, "p50": 0.052130999847577186, "p90": 0.05317099999047059, "mean": 0.05250480003269331, "iqr": 0.0012309999419812812, "raw_times": [0.052130999847577186, 0.05165100014892232, 0.053631000128007145, 0.05317099999047059, 0.05194000004848931], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055000999964249786, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D1024", "batch": 16, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.045381000063571264, "p50": 0.045759999920846894, "p90": 0.04781100005857297, "mean": 0.04770240002471837, "iqr": 0.00238100005844899, "raw_times": [0.045759999920846894, 0.04781100005857297, 0.045381000063571264, 0.04543000000012398, 0.05413000008047675], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04919000002701068, "peak_bytes": 150999040, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D2048", "batch": 16, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05421099990599032, "p50": 0.054861000080563826, "p90": 0.05564100001720362, "mean": 0.05508300000656163, "iqr": 0.0010100000054080738, "raw_times": [0.056071000017254846, 0.054861000080563826, 0.05564100001720362, 0.05421099990599032, 0.05463100001179555], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05805000000691507, "peak_bytes": 301998080, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D4096", "batch": 16, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.20916299990858533, "p50": 0.21016300001974741, "p90": 0.21141399997759436, "mean": 0.21107719999235997, "iqr": 0.0015210000583465444, "raw_times": [0.21141399997759436, 0.2147530001366249, 0.2098929999192478, 0.21016300001974741, 0.20916299990858533], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21191299993006396, "peak_bytes": 603996160, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D8192", "batch": 16, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.43155599996680394, "p50": 0.43475600000419945, "p90": 0.4373360000045068, "mean": 0.43558200000006764, "iqr": 0.003800000058618025, "raw_times": [0.43475600000419945, 0.44072600007893925, 0.4373360000045068, 0.43353599994588876, 0.43155599996680394], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.44892699997944874, "peak_bytes": 1207992320, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D1024", "batch": 16, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0483500000427739, "p50": 0.049099999841928366, "p90": 0.04950099992129253, "mean": 0.050544599935165024, "iqr": 0.0011199999789823778, "raw_times": [0.048380999942310154, 0.04950099992129253, 0.05739099992752017, 0.049099999841928366, 0.0483500000427739], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05153099982635467, "peak_bytes": 335581184, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D2048", "batch": 16, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2181429999836837, "p50": 0.2215729998624738, "p90": 0.2217329999893991, "mean": 0.22086119997766218, "iqr": 0.003440000000409782, "raw_times": [0.2181429999836837, 0.2217329999893991, 0.21829299998898932, 0.2215729998624738, 0.22456400006376498], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22583300005862839, "peak_bytes": 603987968, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D4096", "batch": 16, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.43596600016826415, "p50": 0.4398270000365301, "p90": 0.4409260000102222, "mean": 0.4390922000766295, "iqr": 0.003549999973984086, "raw_times": [0.4398270000365301, 0.4409260000102222, 0.4413660001318931, 0.43596600016826415, 0.4373760000362381], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.44040700004188693, "peak_bytes": 1207975936, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D8192", "batch": 16, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8329219999723136, "p50": 0.8419220000632777, "p90": 0.8434520000264456, "mean": 0.84072780000497, "iqr": 0.002130000211764127, "raw_times": [0.8329219999723136, 0.8419220000632777, 0.8440210001481319, 0.8434520000264456, 0.8413219998146815], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8442119999472197, "peak_bytes": 2415951872, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00151824951171875, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D1024", "batch": 16, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21230300012575753, "p50": 0.2135429999725602, "p90": 0.2142630000889767, "mean": 0.21426700000120036, "iqr": 0.0008800002433417831, "raw_times": [0.21230300012575753, 0.2133829998456349, 0.2135429999725602, 0.2142630000889767, 0.21784299997307244], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22175300000526477, "peak_bytes": 671125504, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D2048", "batch": 16, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4536460000963416, "p50": 0.45670700001210207, "p90": 0.4569770001126017, "mean": 0.45669080004699936, "iqr": 0.00113999999484804, "raw_times": [0.4536460000963416, 0.4569770001126017, 0.45583700011775363, 0.45670700001210207, 0.4602869998961978], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4546860000118613, "peak_bytes": 1207967744, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8352710001418018, "p50": 0.8370320001631626, "p90": 0.8388319999994565, "mean": 0.8375798000997747, "iqr": 0.0019899998733308166, "raw_times": [0.8352710001418018, 0.8368420001261256, 0.8399220000683272, 0.8370320001631626, 0.8388319999994565], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.849921000053655, "peak_bytes": 2415935488, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6451530000267667, "p50": 1.6546740000649152, "p90": 1.6553830000702874, "mean": 1.6516054000476288, "iqr": 0.008870000101524056, "raw_times": [1.6553830000702874, 1.6465129999687633, 1.6563040001074114, 1.6546740000649152, 1.6451530000267667], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.655194000022675, "peak_bytes": 4831870976, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null}
diff --git a/layer_norm/impls/cells/benchmark.py b/layer_norm/impls/cells/benchmark.py
index 6a00a9f99d8d044ab5f9dc0f5019344cef0612b9..d871d1b25fedf8b294c567e9ac582decb62f3cde 100644
--- a/layer_norm/impls/cells/benchmark.py
+++ b/layer_norm/impls/cells/benchmark.py
@@ -3,6 +3,7 @@
 # dependencies = [
 #     "numpy",
 #     "torch==2.8.0",
+#     "kernels",
 #     "kernels-benchmark-tools",
 # ]
 #
@@ -12,15 +13,37 @@
 import torch
 import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
 
+# Load the layer norm kernel
+layer_norm_kernel = get_kernel("kernels-community/layer-norm")
 
-def torch_layer_norm(x, weight, bias, eps: float = 1e-5):
-    return torch.nn.functional.layer_norm(x, (x.shape[-1],), weight, bias, eps)
+
+def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
+    B, S, D = x.shape  # x must be exactly 3-D; its dims are unpacked as (B, S, D)
+    # Layer norm via the loaded kernel's dropout_add_ln_fwd op: x is passed as a 2-D [B*S, D] view, weight as gamma, bias as beta.
+    out = layer_norm_kernel.dropout_add_ln_fwd(
+        input=x.view(-1, D),  # flatten (B, S, D) -> (B*S, D); assumes x's layout allows .view (e.g. contiguous) -- TODO confirm
+        gamma=weight,
+        beta=bias,
+        rowscale=None,
+        colscale=None,
+        x0_subset=None,
+        z_subset=None,
+        dropout_p=0.0,  # dropout probability of zero
+        epsilon=eps,
+        rowscale_const=1.0,
+        z_numrows=S,  # NOTE(review): passed while z_subset=None -- presumably ignored in that case; confirm against the kernel
+        gen=None,
+        residual_in_fp32=False,
+        is_rms_norm=False,  # request layer norm rather than the RMS-norm variant
+    )[0].view(B, S, D)  # keep only the first returned item and view it back to (B, S, D)
+    return out
 
 
 run_benchmark(
     kernel_type=KernelTypeEnum.LAYER_NORM,
-    impl_name="torch_layer_norm",
-    impl_tags={"family": "torch", "op": "layer_norm"},
-    impl_func=torch_layer_norm,
+    impl_name="hf_kernels_layer_norm",
+    impl_tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"},
+    impl_func=hf_kernels_layer_norm,
 )
\ No newline at end of file
diff --git a/layer_norm/impls/hf_kernels_layer_norm.html b/layer_norm/impls/hf_kernels_layer_norm.html
index 7c6779ec37fa43900d621bf6ad69b69b9c4ea785..41eded723a429c13af73316bd24eb2cd47fde3c8 100644
--- a/layer_norm/impls/hf_kernels_layer_norm.html
+++ b/layer_norm/impls/hf_kernels_layer_norm.html
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     
@@ -3873,7 +3873,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 47.20s
+Cell: benchmark | 6.33s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3943,19 +3943,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     130.368us      1245.87%     130.368us     130.368us             1  
-                                  hf_kernels_layer_norm        10.50%     197.573us        99.64%       1.875ms       1.875ms       0.000us         0.00%      14.048us      14.048us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         3.47%      65.272us        87.77%       1.652ms     550.605us      10.464us       100.00%      14.048us       4.683us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      10.464us       100.00%      10.464us       3.488us             3  
-                                Activity Buffer Request        79.11%       1.489ms        79.11%       1.489ms       1.489ms       3.584us        34.25%       3.584us       3.584us             1  
-                                             aten::view         1.38%      25.881us         1.38%      25.881us       4.314us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         2.61%      49.141us         2.61%      49.141us       5.460us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.46%       8.610us         0.46%       8.610us       2.870us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.12%      39.872us         2.12%      39.872us      13.291us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.36%       6.770us         0.36%       6.770us       6.770us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     126.624us      1327.85%     126.624us     126.624us             1  
+                                  hf_kernels_layer_norm        10.50%     192.054us        99.63%       1.822ms       1.822ms       0.000us         0.00%      12.800us      12.800us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         3.73%      68.149us        87.79%       1.605ms     535.007us       9.536us       100.00%      12.800us       4.267us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       9.536us       100.00%       9.536us       3.179us             3  
+                                Activity Buffer Request        78.93%       1.443ms        78.93%       1.443ms       1.443ms       3.264us        34.23%       3.264us       3.264us             1  
+                                             aten::view         1.34%      24.540us         1.34%      24.540us       4.090us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         2.50%      45.632us         2.50%      45.632us       5.070us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.52%       9.500us         0.52%       9.500us       3.167us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.11%      38.660us         2.11%      38.660us      12.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.37%       6.690us         0.37%       6.690us       6.690us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.882ms
-Self CUDA time total: 10.464us
+Self CPU time total: 1.828ms
+Self CUDA time total: 9.536us
 
 
 
@@ -3965,19 +3965,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     117.981us       863.51%     117.981us     117.981us             1  
-                                  hf_kernels_layer_norm         7.44%     129.853us        99.69%       1.741ms       1.741ms       0.000us         0.00%      18.271us      18.271us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         2.62%      45.831us        91.51%       1.598ms     532.638us      13.663us       100.00%      18.271us       6.090us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      13.663us       100.00%      13.663us       4.554us             3  
-                                Activity Buffer Request        85.13%       1.487ms        85.13%       1.487ms       1.487ms       4.608us        33.73%       4.608us       4.608us             1  
-                                             aten::view         0.75%      13.060us         0.75%      13.060us       2.177us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.75%      30.520us         1.75%      30.520us       3.391us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.27%       4.661us         0.27%       4.661us       1.554us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.74%      30.321us         1.74%      30.321us      10.107us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.31%       5.410us         0.31%       5.410us       5.410us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     118.975us       960.72%     118.975us     118.975us             1  
+                                  hf_kernels_layer_norm         8.90%     155.923us        99.67%       1.747ms       1.747ms       0.000us         0.00%      16.576us      16.576us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.71%      47.470us        90.07%       1.579ms     526.204us      12.384us       100.00%      16.576us       5.525us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      12.384us       100.00%      12.384us       4.128us             3  
+                                Activity Buffer Request        83.60%       1.465ms        83.60%       1.465ms       1.465ms       4.192us        33.85%       4.192us       4.192us             1  
+                                             aten::view         0.71%      12.400us         0.71%      12.400us       2.067us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.73%      30.340us         1.73%      30.340us       3.371us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.28%       4.970us         0.28%       4.970us       1.657us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.74%      30.551us         1.74%      30.551us      10.184us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.33%       5.780us         0.33%       5.780us       5.780us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.746ms
-Self CUDA time total: 13.663us
+Self CPU time total: 1.753ms
+Self CUDA time total: 12.384us
 
 
 
@@ -3987,19 +3987,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S128_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     116.509us       943.24%     116.509us     116.509us             1  
-                                  hf_kernels_layer_norm         7.98%     138.752us        99.72%       1.735ms       1.735ms       0.000us         0.00%      16.480us      16.480us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         2.45%      42.600us        91.01%       1.583ms     527.711us      12.352us       100.00%      16.480us       5.493us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      12.352us       100.00%      12.352us       4.117us             3  
-                                Activity Buffer Request        85.01%       1.479ms        85.01%       1.479ms       1.479ms       4.128us        33.42%       4.128us       4.128us             1  
-                                             aten::view         0.74%      12.801us         0.74%      12.801us       2.134us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.67%      29.111us         1.67%      29.111us       3.235us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.27%       4.660us         0.27%       4.660us       1.553us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.61%      28.011us         1.61%      28.011us       9.337us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.28%       4.840us         0.28%       4.840us       4.840us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     109.887us      1003.99%     109.887us     109.887us             1  
+                                  hf_kernels_layer_norm         7.66%     143.860us        99.71%       1.872ms       1.872ms       0.000us         0.00%      14.626us      14.626us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.49%      46.702us        91.41%       1.716ms     571.882us      10.945us       100.00%      14.626us       4.875us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      10.945us       100.00%      10.945us       3.648us             3  
+                                Activity Buffer Request        85.70%       1.609ms        85.70%       1.609ms       1.609ms       3.681us        33.63%       3.681us       3.681us             1  
+                                             aten::view         0.64%      12.051us         0.64%      12.051us       2.008us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.56%      29.239us         1.56%      29.239us       3.249us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.26%       4.870us         0.26%       4.870us       1.623us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.40%      26.311us         1.40%      26.311us       8.770us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.29%       5.350us         0.29%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.740ms
-Self CUDA time total: 12.352us
+Self CPU time total: 1.877ms
+Self CUDA time total: 10.945us
 
 
 
@@ -4009,19 +4009,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S128_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     125.982us       578.96%     125.982us     125.982us             1  
-                                  hf_kernels_layer_norm         6.68%     137.125us        99.77%       2.048ms       2.048ms       0.000us         0.00%      29.120us      29.120us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         2.07%      42.461us        92.46%       1.898ms     632.783us      21.760us       100.00%      29.120us       9.707us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      21.760us       100.00%      21.760us       7.253us             3  
-                                Activity Buffer Request        73.49%       1.509ms        73.49%       1.509ms       1.509ms       7.360us        33.82%       7.360us       7.360us             1  
-                                             aten::view         0.63%      13.010us         0.63%      13.010us       2.168us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.55%      31.790us         1.55%      31.790us       3.532us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.23%       4.660us         0.23%       4.660us       1.553us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        15.12%     310.466us        15.12%     310.466us     103.489us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       4.720us         0.23%       4.720us       4.720us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     120.287us       916.82%     120.287us     120.287us             1  
+                                  hf_kernels_layer_norm         7.38%     148.710us        99.70%       2.008ms       2.008ms       0.000us         0.00%      17.504us      17.504us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.28%      45.984us        91.73%       1.848ms     615.912us      13.120us       100.00%      17.504us       5.835us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      13.120us       100.00%      13.120us       4.373us             3  
+                                Activity Buffer Request        71.87%       1.448ms        71.87%       1.448ms       1.448ms       4.384us        33.41%       4.384us       4.384us             1  
+                                             aten::view         0.60%      12.011us         0.60%      12.011us       2.002us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.48%      29.740us         1.48%      29.740us       3.304us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.26%       5.319us         0.26%       5.319us       1.773us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        15.83%     318.904us        15.83%     318.904us     106.301us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       5.970us         0.30%       5.970us       5.970us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.053ms
-Self CUDA time total: 21.760us
+Self CPU time total: 2.014ms
+Self CUDA time total: 13.120us
 
 
 
@@ -4031,19 +4031,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     121.087us      1103.20%     121.087us     121.087us             1  
-                                  hf_kernels_layer_norm        42.59%       1.314ms        99.83%       3.079ms       3.079ms       0.000us         0.00%      14.528us      14.528us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         1.41%      43.391us        56.84%       1.753ms     584.439us      10.976us       100.00%      14.528us       4.843us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      10.976us       100.00%      10.976us       3.659us             3  
-                                Activity Buffer Request        48.69%       1.502ms        48.69%       1.502ms       1.502ms       3.552us        32.36%       3.552us       3.552us             1  
-                                             aten::view         0.40%      12.250us         0.40%      12.250us       2.042us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         0.96%      29.520us         0.96%      29.520us       3.280us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.17%       5.350us         0.17%       5.350us       1.783us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         5.61%     173.174us         5.61%     173.174us      57.725us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.17%       5.330us         0.17%       5.330us       5.330us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     114.559us      1189.48%     114.559us     114.559us             1  
+                                  hf_kernels_layer_norm         7.21%     135.832us        99.75%       1.879ms       1.879ms       0.000us         0.00%      12.767us      12.767us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.53%      47.731us        91.89%       1.731ms     576.915us       9.631us       100.00%      12.767us       4.256us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       9.631us       100.00%       9.631us       3.210us             3  
+                                Activity Buffer Request        78.55%       1.480ms        78.55%       1.480ms       1.480ms       3.136us        32.56%       3.136us       3.136us             1  
+                                             aten::view         0.65%      12.210us         0.65%      12.210us       2.035us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.55%      29.201us         1.55%      29.201us       3.245us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.26%       4.830us         0.26%       4.830us       1.610us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.00%     169.482us         9.00%     169.482us      56.494us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.25%       4.770us         0.25%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.084ms
-Self CUDA time total: 10.976us
+Self CPU time total: 1.884ms
+Self CUDA time total: 9.631us
 
 
 
@@ -4053,19 +4053,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     118.334us       463.40%     118.334us     118.334us             1  
-                                  hf_kernels_layer_norm        20.93%     106.845us        98.96%     505.171us     505.171us       0.000us         0.00%      34.112us      34.112us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         8.38%      42.772us        75.59%     385.897us     128.632us      25.536us       100.00%      34.112us      11.371us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      25.536us       100.00%      25.536us       8.512us             3  
-                                Activity Buffer Request        27.16%     138.642us        27.16%     138.642us     138.642us       8.576us        33.58%       8.576us       8.576us             1  
-                                             aten::view         2.43%      12.429us         2.43%      12.429us       2.072us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         6.18%      31.540us         6.18%      31.540us       3.504us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         1.06%       5.420us         1.06%       5.420us       1.807us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.82%     167.523us        32.82%     167.523us      55.841us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.04%       5.330us         1.04%       5.330us       5.330us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     117.151us       841.66%     117.151us     117.151us             1  
+                                  hf_kernels_layer_norm         7.38%     134.703us        99.74%       1.819ms       1.819ms       0.000us         0.00%      18.495us      18.495us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.52%      45.930us        91.68%       1.673ms     557.511us      13.919us       100.00%      18.495us       6.165us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      13.919us       100.00%      13.919us       4.640us             3  
+                                Activity Buffer Request        78.70%       1.436ms        78.70%       1.436ms       1.436ms       4.576us        32.88%       4.576us       4.576us             1  
+                                             aten::view         0.67%      12.200us         0.67%      12.200us       2.033us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.63%      29.679us         1.63%      29.679us       3.298us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.30%       5.450us         0.30%       5.450us       1.817us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.54%     155.763us         8.54%     155.763us      51.921us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       4.800us         0.26%       4.800us       4.800us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 510.501us
-Self CUDA time total: 25.536us
+Self CPU time total: 1.824ms
+Self CUDA time total: 13.919us
 
 
 
@@ -4075,19 +4075,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S512_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     120.447us       409.57%     120.447us     120.447us             1  
-                                  hf_kernels_layer_norm        17.42%     106.524us        99.31%     607.323us     607.323us       0.000us         0.00%      39.296us      39.296us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         7.38%      45.140us        79.94%     488.879us     162.960us      29.408us       100.00%      39.296us      13.099us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      29.408us       100.00%      29.408us       9.803us             3  
-                                Activity Buffer Request        39.10%     239.095us        39.10%     239.095us     239.095us       9.888us        33.62%       9.888us       9.888us             1  
-                                             aten::view         1.95%      11.920us         1.95%      11.920us       1.987us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         4.84%      29.591us         4.84%      29.591us       3.288us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.81%       4.930us         0.81%       4.930us       1.643us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        27.82%     170.123us        27.82%     170.123us      56.708us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.69%       4.200us         0.69%       4.200us       4.200us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     121.982us       816.32%     121.982us     121.982us             1  
+                                  hf_kernels_layer_norm         7.42%     137.921us        99.71%       1.853ms       1.853ms       0.000us         0.00%      19.934us      19.934us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.51%      46.641us        91.61%       1.702ms     567.498us      14.943us       100.00%      19.934us       6.645us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      14.943us       100.00%      14.943us       4.981us             3  
+                                Activity Buffer Request        78.68%       1.462ms        78.68%       1.462ms       1.462ms       4.991us        33.40%       4.991us       4.991us             1  
+                                             aten::view         0.68%      12.581us         0.68%      12.581us       2.097us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.61%      30.011us         1.61%      30.011us       3.335us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.26%       4.880us         0.26%       4.880us       1.627us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.55%     158.912us         8.55%     158.912us      52.971us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.29%       5.320us         0.29%       5.320us       5.320us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 611.523us
-Self CUDA time total: 29.408us
+Self CPU time total: 1.858ms
+Self CUDA time total: 14.943us
 
 
 
@@ -4097,19 +4097,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S512_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     133.151us       162.79%     133.151us     133.151us             1  
-                                  hf_kernels_layer_norm         6.90%     130.311us        99.76%       1.885ms       1.885ms       0.000us         0.00%     131.167us     131.167us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         2.33%      44.060us        92.18%       1.742ms     580.686us      81.791us       100.00%     131.167us      43.722us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      81.791us       100.00%      81.791us      27.264us             3  
-                                Activity Buffer Request        79.05%       1.494ms        79.05%       1.494ms       1.494ms      49.376us        60.37%      49.376us      49.376us             1  
-                                             aten::view         0.68%      12.842us         0.68%      12.842us       2.140us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.69%      31.890us         1.69%      31.890us       3.543us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.27%       5.171us         0.27%       5.171us       1.724us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.84%     167.034us         8.84%     167.034us      55.678us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.24%       4.571us         0.24%       4.571us       4.571us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     122.336us       491.39%     122.336us     122.336us             1  
+                                  hf_kernels_layer_norm         7.27%     134.311us        99.73%       1.842ms       1.842ms       0.000us         0.00%      33.152us      33.152us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.48%      45.720us        91.77%       1.695ms     564.845us      24.896us       100.00%      33.152us      11.051us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      24.896us       100.00%      24.896us       8.299us             3  
+                                Activity Buffer Request        78.89%       1.457ms        78.89%       1.457ms       1.457ms       8.256us        33.16%       8.256us       8.256us             1  
+                                             aten::view         0.69%      12.770us         0.69%      12.770us       2.128us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.64%      30.291us         1.64%      30.291us       3.366us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.28%       5.131us         0.28%       5.131us       1.710us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.48%     156.672us         8.48%     156.672us      52.224us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.950us         0.27%       4.950us       4.950us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.890ms
-Self CUDA time total: 81.791us
+Self CPU time total: 1.847ms
+Self CUDA time total: 24.896us
 
 
 
@@ -4119,19 +4119,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S1024_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     117.246us       654.27%     117.246us     117.246us             1  
-                                  hf_kernels_layer_norm        22.73%     119.272us        99.14%     520.171us     520.171us       0.000us         0.00%      23.808us      23.808us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         8.22%      43.142us        74.00%     388.268us     129.423us      17.920us       100.00%      23.808us       7.936us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      17.920us       100.00%      17.920us       5.973us             3  
-                                Activity Buffer Request        27.73%     145.503us        27.73%     145.503us     145.503us       5.888us        32.86%       5.888us       5.888us             1  
-                                             aten::view         2.41%      12.631us         2.41%      12.631us       2.105us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         5.99%      31.410us         5.99%      31.410us       3.490us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.87%       4.560us         0.87%       4.560us       1.520us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        31.19%     163.653us        31.19%     163.653us      54.551us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.86%       4.531us         0.86%       4.531us       4.531us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     112.508us      1085.25%     112.508us     112.508us             1  
+                                  hf_kernels_layer_norm        20.69%     103.551us        99.03%     495.767us     495.767us       0.000us         0.00%      13.759us      13.759us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         9.55%      47.810us        76.09%     380.926us     126.975us      10.367us       100.00%      13.759us       4.586us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      10.367us       100.00%      10.367us       3.456us             3  
+                                Activity Buffer Request        28.93%     144.803us        28.93%     144.803us     144.803us       3.392us        32.72%       3.392us       3.392us             1  
+                                             aten::view         2.26%      11.290us         2.26%      11.290us       1.882us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         5.78%      28.941us         5.78%      28.941us       3.216us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         1.18%       5.889us         1.18%       5.889us       1.963us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.66%     153.483us        30.66%     153.483us      51.161us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.97%       4.840us         0.97%       4.840us       4.840us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 524.702us
-Self CUDA time total: 17.920us
+Self CPU time total: 500.607us
+Self CUDA time total: 10.367us
 
 
 
@@ -4141,19 +4141,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S1024_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     136.733us       373.85%     136.733us     136.733us             1  
-                                  hf_kernels_layer_norm         7.33%     138.162us        99.74%       1.881ms       1.881ms       0.000us         0.00%      48.861us      48.861us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         2.44%      46.001us        91.74%       1.730ms     576.679us      36.574us       100.00%      48.861us      16.287us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      36.574us       100.00%      36.574us      12.191us             3  
-                                Activity Buffer Request        78.81%       1.486ms        78.81%       1.486ms       1.486ms      12.287us        33.59%      12.287us      12.287us             1  
-                                             aten::view         0.68%      12.810us         0.68%      12.810us       2.135us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.62%      30.630us         1.62%      30.630us       3.403us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.26%       4.810us         0.26%       4.810us       1.603us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.61%     162.344us         8.61%     162.344us      54.115us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.26%       4.870us         0.26%       4.870us       4.870us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     114.622us       709.29%     114.622us     114.622us             1  
+                                  hf_kernels_layer_norm        17.15%     104.082us        99.15%     601.769us     601.769us       0.000us         0.00%      21.536us      21.536us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         7.36%      44.690us        80.00%     485.537us     161.846us      16.160us       100.00%      21.536us       7.179us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      16.160us       100.00%      16.160us       5.387us             3  
+                                Activity Buffer Request        41.13%     249.624us        41.13%     249.624us     249.624us       5.376us        33.27%       5.376us       5.376us             1  
+                                             aten::view         2.00%      12.150us         2.00%      12.150us       2.025us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         4.85%      29.441us         4.85%      29.441us       3.271us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.88%       5.329us         0.88%       5.329us       1.776us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        25.78%     156.453us        25.78%     156.453us      52.151us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.85%       5.140us         0.85%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.886ms
-Self CUDA time total: 36.574us
+Self CPU time total: 606.909us
+Self CUDA time total: 16.160us
 
 
 
@@ -4163,19 +4163,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S1024_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     133.789us       167.24%     133.789us     133.789us             1  
-                                  hf_kernels_layer_norm         7.08%     135.354us        99.76%       1.906ms       1.906ms       0.000us         0.00%     130.077us     130.077us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         2.31%      44.121us        92.04%       1.758ms     586.112us      79.998us       100.00%     130.077us      43.359us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      79.998us       100.00%      79.998us      26.666us             3  
-                                Activity Buffer Request        79.38%       1.516ms        79.38%       1.516ms       1.516ms      50.079us        62.60%      50.079us      50.079us             1  
-                                             aten::view         0.64%      12.280us         0.64%      12.280us       2.047us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.63%      31.230us         1.63%      31.230us       3.470us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.30%       5.759us         0.30%       5.759us       1.920us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.41%     160.764us         8.41%     160.764us      53.588us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.24%       4.530us         0.24%       4.530us       4.530us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     116.126us       544.07%     116.126us     116.126us             1  
+                                  hf_kernels_layer_norm        21.73%     103.750us        98.94%     472.437us     472.437us       0.000us         0.00%      28.448us      28.448us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         9.81%      46.840us        74.86%     357.435us     119.145us      21.344us       100.00%      28.448us       9.483us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      21.344us       100.00%      21.344us       7.115us             3  
+                                Activity Buffer Request        25.46%     121.562us        25.46%     121.562us     121.562us       7.104us        33.28%       7.104us       7.104us             1  
+                                             aten::view         2.36%      11.252us         2.36%      11.252us       1.875us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         6.20%      29.622us         6.20%      29.622us       3.291us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         1.03%       4.929us         1.03%       4.929us       1.643us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.35%     154.482us        32.35%     154.482us      51.494us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.06%       5.060us         1.06%       5.060us       5.060us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.911ms
-Self CUDA time total: 79.998us
+Self CPU time total: 477.497us
+Self CUDA time total: 21.344us
 
 
 
@@ -4185,19 +4185,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S1024_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm        24.43%     134.014us        98.52%     540.532us     540.532us       0.000us         0.00%     271.640us     271.640us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         7.86%      43.128us        71.93%     394.628us     131.543us     169.403us       100.00%     271.640us      90.547us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     170.779us       100.81%     170.779us     170.779us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     169.403us       100.00%     169.403us      56.468us             3  
-                                Activity Buffer Request        27.58%     151.333us        27.58%     151.333us     151.333us     102.237us        60.35%     102.237us     102.237us             1  
-                                             aten::view         2.17%      11.890us         2.17%      11.890us       1.982us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         5.94%      32.563us         5.94%      32.563us       3.618us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.88%       4.820us         0.88%       4.820us       1.607us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        29.67%     162.784us        29.67%     162.784us      54.261us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.48%       8.120us         1.48%       8.120us       8.120us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     123.005us       198.35%     123.005us     123.005us             1  
+                                  hf_kernels_layer_norm        17.67%     104.362us        99.18%     585.739us     585.739us       0.000us         0.00%      97.950us      97.950us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         7.69%      45.431us        79.53%     469.697us     156.566us      62.015us       100.00%      97.950us      32.650us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      62.015us       100.00%      62.015us      20.672us             3  
+                                Activity Buffer Request        38.94%     229.994us        38.94%     229.994us     229.994us      35.935us        57.95%      35.935us      35.935us             1  
+                                             aten::view         1.98%      11.680us         1.98%      11.680us       1.947us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         4.96%      29.301us         4.96%      29.301us       3.256us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.94%       5.530us         0.94%       5.530us       1.843us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        27.00%     159.441us        27.00%     159.441us      53.147us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.82%       4.870us         0.82%       4.870us       4.870us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 548.652us
-Self CUDA time total: 169.403us
+Self CPU time total: 590.609us
+Self CUDA time total: 62.015us
 
 
 
@@ -4207,19 +4207,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S2048_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     123.518us       475.36%     123.518us     123.518us             1  
-                                  hf_kernels_layer_norm         6.86%     128.144us        99.77%       1.864ms       1.864ms       0.000us         0.00%      34.752us      34.752us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         2.35%      44.000us        92.22%       1.723ms     574.492us      25.984us       100.00%      34.752us      11.584us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      25.984us       100.00%      25.984us       8.661us             3  
-                                Activity Buffer Request        79.41%       1.484ms        79.41%       1.484ms       1.484ms       8.768us        33.74%       8.768us       8.768us             1  
-                                             aten::view         0.69%      12.810us         0.69%      12.810us       2.135us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.65%      30.922us         1.65%      30.922us       3.436us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.24%       4.540us         0.24%       4.540us       1.513us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.56%     160.003us         8.56%     160.003us      53.334us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       4.370us         0.23%       4.370us       4.370us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     112.925us       880.03%     112.925us     112.925us             1  
+                                  hf_kernels_layer_norm        21.36%     101.251us        98.99%     469.286us     469.286us       0.000us         0.00%      17.152us      17.152us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         9.95%      47.161us        75.23%     356.625us     118.875us      12.832us       100.00%      17.152us       5.717us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      12.832us       100.00%      12.832us       4.277us             3  
+                                Activity Buffer Request        24.52%     116.222us        24.52%     116.222us     116.222us       4.320us        33.67%       4.320us       4.320us             1  
+                                             aten::view         2.41%      11.410us         2.41%      11.410us       1.902us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         6.33%      30.000us         6.33%      30.000us       3.333us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.99%       4.690us         0.99%       4.690us       1.563us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.44%     158.552us        33.44%     158.552us      52.851us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.01%       4.791us         1.01%       4.791us       4.791us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.869ms
-Self CUDA time total: 25.984us
+Self CPU time total: 474.077us
+Self CUDA time total: 12.832us
 
 
 
@@ -4229,19 +4229,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S2048_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm        22.41%     105.332us        99.06%     465.510us     465.510us       0.000us         0.00%     143.994us     143.994us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         9.32%      43.790us        74.15%     348.436us     116.145us      90.972us       100.00%     143.994us      47.998us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     126.972us       139.57%     126.972us     126.972us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      90.972us       100.00%      90.972us      30.324us             3  
-                                Activity Buffer Request        22.99%     108.033us        22.99%     108.033us     108.033us      53.022us        58.28%      53.022us      53.022us             1  
-                                             aten::view         2.50%      11.742us         2.50%      11.742us       1.957us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         6.55%      30.800us         6.55%      30.800us       3.422us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         1.14%       5.380us         1.14%       5.380us       1.793us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.14%     160.433us        34.14%     160.433us      53.478us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.94%       4.420us         0.94%       4.420us       4.420us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     116.894us       456.05%     116.894us     116.894us             1  
+                                  hf_kernels_layer_norm        16.78%     104.390us        99.21%     617.040us     617.040us       0.000us         0.00%      34.336us      34.336us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         7.67%      47.682us        80.57%     501.128us     167.043us      25.632us       100.00%      34.336us      11.445us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      25.632us       100.00%      25.632us       8.544us             3  
+                                Activity Buffer Request        42.51%     264.394us        42.51%     264.394us     264.394us       8.704us        33.96%       8.704us       8.704us             1  
+                                             aten::view         1.85%      11.522us         1.85%      11.522us       1.920us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         4.71%      29.300us         4.71%      29.300us       3.256us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.84%       5.220us         0.84%       5.220us       1.740us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        24.85%     154.532us        24.85%     154.532us      51.511us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.79%       4.910us         0.79%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 469.930us
-Self CUDA time total: 90.972us
+Self CPU time total: 621.950us
+Self CUDA time total: 25.632us
 
 
 
@@ -4251,19 +4251,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S2048_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         7.06%     132.903us        99.72%       1.877ms       1.877ms       0.000us         0.00%     251.833us     251.833us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         2.36%      44.503us        92.00%       1.732ms     577.246us     154.620us       100.00%     251.833us      83.944us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     155.868us       100.81%     155.868us     155.868us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     154.620us       100.00%     154.620us      51.540us             3  
-                                Activity Buffer Request        79.11%       1.489ms        79.11%       1.489ms       1.489ms      97.213us        62.87%      97.213us      97.213us             1  
-                                             aten::view         0.66%      12.470us         0.66%      12.470us       2.078us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.68%      31.630us         1.68%      31.630us       3.514us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.25%       4.790us         0.25%       4.790us       1.597us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.59%     161.763us         8.59%     161.763us      53.921us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.28%       5.240us         0.28%       5.240us       5.240us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     123.901us       207.17%     123.901us     123.901us             1  
+                                  hf_kernels_layer_norm        17.03%     105.700us        99.25%     616.179us     616.179us       0.000us         0.00%      95.452us      95.452us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         7.57%      46.994us        80.35%     498.838us     166.279us      59.805us       100.00%      95.452us      31.817us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      59.805us       100.00%      59.805us      19.935us             3  
+                                Activity Buffer Request        42.09%     261.283us        42.09%     261.283us     261.283us      35.647us        59.61%      35.647us      35.647us             1  
+                                             aten::view         1.88%      11.641us         1.88%      11.641us       1.940us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         4.84%      30.020us         4.84%      30.020us       3.336us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.92%       5.739us         0.92%       5.739us       1.913us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        24.93%     154.802us        24.93%     154.802us      51.601us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.75%       4.650us         0.75%       4.650us       4.650us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.882ms
-Self CUDA time total: 154.620us
+Self CPU time total: 620.829us
+Self CUDA time total: 59.805us
 
 
 
@@ -4273,19 +4273,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S2048_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         5.50%     137.653us        75.82%       1.896ms       1.896ms       0.000us         0.00%       1.022ms       1.022ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         1.78%      44.522us        69.80%       1.746ms     581.866us     773.939us       100.00%       1.022ms     340.666us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     775.315us       100.18%     775.315us     775.315us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     773.939us       100.00%     773.939us     257.980us             3  
-                                Activity Buffer Request        60.12%       1.504ms        60.12%       1.504ms       1.504ms     248.060us        32.05%     248.060us     248.060us             1  
-                                             aten::view         0.52%      12.931us         0.52%      12.931us       2.155us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.22%      30.580us         1.22%      30.580us       3.398us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.18%       4.610us         0.18%       4.610us       1.537us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         6.49%     162.233us         6.49%     162.233us      54.078us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        24.18%     604.773us        24.18%     604.773us     604.773us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm        20.93%     115.170us        99.06%     545.227us     545.227us       0.000us         0.00%     194.686us     194.686us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         8.82%      48.552us        75.83%     417.326us     139.109us     120.767us       100.00%     194.686us      64.895us             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     137.247us       113.65%     137.247us     137.247us             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     120.767us       100.00%     120.767us      40.256us             3  
+                                Activity Buffer Request        31.56%     173.672us        31.56%     173.672us     173.672us      73.919us        61.21%      73.919us      73.919us             1  
+                                             aten::view         2.31%      12.731us         2.31%      12.731us       2.122us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         5.79%      31.840us         5.79%      31.840us       3.538us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         1.15%       6.350us         1.15%       6.350us       2.117us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        28.51%     156.912us        28.51%     156.912us      52.304us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.94%       5.151us         0.94%       5.151us       5.151us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.501ms
-Self CUDA time total: 773.939us
+Self CPU time total: 550.378us
+Self CUDA time total: 120.767us
 
 
 
@@ -4295,19 +4295,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     114.235us      1022.79%     114.235us     114.235us             1  
-                                  hf_kernels_layer_norm        20.56%     107.954us        99.19%     520.921us     520.921us       0.000us         0.00%      14.722us      14.722us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         8.06%      42.351us        76.34%     400.957us     133.652us      11.169us       100.00%      14.722us       4.907us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      11.169us       100.00%      11.169us       3.723us             3  
-                                Activity Buffer Request        31.14%     163.523us        31.14%     163.523us     163.523us       3.553us        31.81%       3.553us       3.553us             1  
-                                             aten::view         2.29%      12.010us         2.29%      12.010us       2.002us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         6.02%      31.620us         6.02%      31.620us       3.513us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.87%       4.550us         0.87%       4.550us       1.517us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.26%     158.913us        30.26%     158.913us      52.971us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.81%       4.270us         0.81%       4.270us       4.270us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     120.958us      1277.01%     120.958us     120.958us             1  
+                                  hf_kernels_layer_norm        13.96%     126.333us        99.48%     900.293us     900.293us       0.000us         0.00%      12.480us      12.480us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         5.25%      47.490us        84.03%     760.450us     253.483us       9.472us       100.00%      12.480us       4.160us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       9.472us       100.00%       9.472us       3.157us             3  
+                                Activity Buffer Request        56.99%     515.778us        56.99%     515.778us     515.778us       3.008us        31.76%       3.008us       3.008us             1  
+                                             aten::view         1.49%      13.510us         1.49%      13.510us       2.252us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         3.30%      29.900us         3.30%      29.900us       3.322us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.61%       5.520us         0.61%       5.520us       1.840us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        17.87%     161.762us        17.87%     161.762us      53.921us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.52%       4.731us         0.52%       4.731us       4.731us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 525.191us
-Self CUDA time total: 11.169us
+Self CPU time total: 905.024us
+Self CUDA time total: 9.472us
 
 
 
@@ -4317,19 +4317,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     123.677us       491.74%     123.677us     123.677us             1  
-                                  hf_kernels_layer_norm         6.82%     128.063us        99.76%       1.873ms       1.873ms       0.000us         0.00%      33.759us      33.759us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         2.38%      44.761us        92.27%       1.732ms     577.489us      25.151us       100.00%      33.759us      11.253us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      25.151us       100.00%      25.151us       8.384us             3  
-                                Activity Buffer Request        79.39%       1.491ms        79.39%       1.491ms       1.491ms       8.608us        34.23%       8.608us       8.608us             1  
-                                             aten::view         0.67%      12.571us         0.67%      12.571us       2.095us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.69%      31.810us         1.69%      31.810us       3.534us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.24%       4.510us         0.24%       4.510us       1.503us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.56%     160.733us         8.56%     160.733us      53.578us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.24%       4.560us         0.24%       4.560us       4.560us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     119.647us       905.32%     119.647us     119.647us             1  
+                                  hf_kernels_layer_norm         7.02%     129.983us        99.72%       1.846ms       1.846ms       0.000us         0.00%      17.632us      17.632us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.48%      45.879us        92.05%       1.704ms     568.058us      13.216us       100.00%      17.632us       5.877us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      13.216us       100.00%      13.216us       4.405us             3  
+                                Activity Buffer Request        79.30%       1.468ms        79.30%       1.468ms       1.468ms       4.416us        33.41%       4.416us       4.416us             1  
+                                             aten::view         0.65%      12.030us         0.65%      12.030us       2.005us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.60%      29.701us         1.60%      29.701us       3.300us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.33%       6.090us         0.33%       6.090us       2.030us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.34%     154.332us         8.34%     154.332us      51.444us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.130us         0.28%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.878ms
-Self CUDA time total: 25.151us
+Self CPU time total: 1.851ms
+Self CUDA time total: 13.216us
 
 
 
@@ -4339,19 +4339,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S128_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     119.706us       417.98%     119.706us     119.706us             1  
-                                  hf_kernels_layer_norm        25.81%     125.022us        99.07%     479.820us     479.820us       0.000us         0.00%      38.142us      38.142us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         9.03%      43.713us        70.85%     343.148us     114.383us      28.639us       100.00%      38.142us      12.714us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      28.639us       100.00%      28.639us       9.546us             3  
-                                Activity Buffer Request        21.68%     105.002us        21.68%     105.002us     105.002us       9.503us        33.18%       9.503us       9.503us             1  
-                                             aten::view         2.41%      11.650us         2.41%      11.650us       1.942us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         6.56%      31.751us         6.56%      31.751us       3.528us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.93%       4.499us         0.93%       4.499us       1.500us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.66%     158.183us        32.66%     158.183us      52.728us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.93%       4.510us         0.93%       4.510us       4.510us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     119.904us       814.57%     119.904us     119.904us             1  
+                                  hf_kernels_layer_norm         6.96%     128.481us        99.73%       1.842ms       1.842ms       0.000us         0.00%      19.648us      19.648us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.56%      47.250us        92.11%       1.701ms     566.981us      14.720us       100.00%      19.648us       6.549us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      14.720us       100.00%      14.720us       4.907us             3  
+                                Activity Buffer Request        79.23%       1.463ms        79.23%       1.463ms       1.463ms       4.928us        33.48%       4.928us       4.928us             1  
+                                             aten::view         0.66%      12.121us         0.66%      12.121us       2.020us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.62%      29.881us         1.62%      29.881us       3.320us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.34%       6.300us         0.34%       6.300us       2.100us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.36%     154.452us         8.36%     154.452us      51.484us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.031us         0.27%       5.031us       5.031us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 484.330us
-Self CUDA time total: 28.639us
+Self CPU time total: 1.847ms
+Self CUDA time total: 14.720us
 
 
 
@@ -4361,19 +4361,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S128_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     136.221us       162.42%     136.221us     136.221us             1  
-                                  hf_kernels_layer_norm         5.91%     110.784us        99.77%       1.870ms       1.870ms       0.000us         0.00%     135.358us     135.358us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         2.49%      46.760us        93.19%       1.747ms     582.332us      83.871us       100.00%     135.358us      45.119us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      83.871us       100.00%      83.871us      27.957us             3  
-                                Activity Buffer Request        79.40%       1.488ms        79.40%       1.488ms       1.488ms      51.487us        61.39%      51.487us      51.487us             1  
-                                             aten::view         0.67%      12.640us         0.67%      12.640us       2.107us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.64%      30.810us         1.64%      30.810us       3.423us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.26%       4.850us         0.26%       4.850us       1.617us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.39%     176.124us         9.39%     176.124us      58.708us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       4.240us         0.23%       4.240us       4.240us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     123.838us       511.90%     123.838us     123.838us             1  
+                                  hf_kernels_layer_norm         6.93%     126.950us        99.73%       1.827ms       1.827ms       0.000us         0.00%      32.224us      32.224us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.51%      46.080us        92.13%       1.688ms     562.698us      24.192us       100.00%      32.224us      10.741us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      24.192us       100.00%      24.192us       8.064us             3  
+                                Activity Buffer Request        79.12%       1.450ms        79.12%       1.450ms       1.450ms       8.032us        33.20%       8.032us       8.032us             1  
+                                             aten::view         0.67%      12.241us         0.67%      12.241us       2.040us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.67%      30.641us         1.67%      30.641us       3.405us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.33%       5.980us         0.33%       5.980us       1.993us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.50%     155.772us         8.50%     155.772us      51.924us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.990us         0.27%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.875ms
-Self CUDA time total: 83.871us
+Self CPU time total: 1.832ms
+Self CUDA time total: 24.192us
 
 
 
@@ -4383,19 +4383,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     127.996us       494.44%     127.996us     127.996us             1  
-                                  hf_kernels_layer_norm         7.05%     134.013us        99.76%       1.896ms       1.896ms       0.000us         0.00%      34.367us      34.367us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         2.28%      43.262us        92.05%       1.750ms     583.272us      25.887us       100.00%      34.367us      11.456us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      25.887us       100.00%      25.887us       8.629us             3  
-                                Activity Buffer Request        79.44%       1.510ms        79.44%       1.510ms       1.510ms       8.480us        32.76%       8.480us       8.480us             1  
-                                             aten::view         0.66%      12.451us         0.66%      12.451us       2.075us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.65%      31.400us         1.65%      31.400us       3.489us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.25%       4.830us         0.25%       4.830us       1.610us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.44%     160.343us         8.44%     160.343us      53.448us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.24%       4.600us         0.24%       4.600us       4.600us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     114.752us       903.27%     114.752us     114.752us             1  
+                                  hf_kernels_layer_norm         6.98%     127.002us        99.74%       1.816ms       1.816ms       0.000us         0.00%      16.896us      16.896us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.46%      44.721us        92.11%       1.677ms     559.031us      12.704us       100.00%      16.896us       5.632us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      12.704us       100.00%      12.704us       4.235us             3  
+                                Activity Buffer Request        79.42%       1.446ms        79.42%       1.446ms       1.446ms       4.192us        33.00%       4.192us       4.192us             1  
+                                             aten::view         0.65%      11.810us         0.65%      11.810us       1.968us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.61%      29.350us         1.61%      29.350us       3.261us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.30%       5.480us         0.30%       5.480us       1.827us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.33%     151.582us         8.33%     151.582us      50.527us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       4.810us         0.26%       4.810us       4.810us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.901ms
-Self CUDA time total: 25.887us
+Self CPU time total: 1.821ms
+Self CUDA time total: 12.704us
 
 
 
@@ -4405,19 +4405,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm        25.05%     130.783us        99.19%     517.901us     517.901us       0.000us         0.00%     143.738us     143.738us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         8.16%      42.610us        71.72%     374.457us     124.819us      90.940us       100.00%     143.738us      47.913us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     129.787us       142.72%     129.787us     129.787us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      90.940us       100.00%      90.940us      30.313us             3  
-                                Activity Buffer Request        26.41%     137.873us        26.41%     137.873us     137.873us      52.798us        58.06%      52.798us      52.798us             1  
-                                             aten::view         2.42%      12.661us         2.42%      12.661us       2.110us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         6.15%      32.091us         6.15%      32.091us       3.566us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.86%       4.510us         0.86%       4.510us       1.503us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.14%     157.373us        30.14%     157.373us      52.458us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.81%       4.209us         0.81%       4.209us       4.209us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     114.171us       434.06%     114.171us     114.171us             1  
+                                  hf_kernels_layer_norm        21.27%     106.031us        98.93%     493.167us     493.167us       0.000us         0.00%      35.134us      35.134us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         8.94%      44.581us        75.39%     375.835us     125.278us      26.303us       100.00%      35.134us      11.711us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      26.303us       100.00%      26.303us       8.768us             3  
+                                Activity Buffer Request        28.70%     143.052us        28.70%     143.052us     143.052us       8.831us        33.57%       8.831us       8.831us             1  
+                                             aten::view         2.27%      11.301us         2.27%      11.301us       1.883us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         5.86%      29.220us         5.86%      29.220us       3.247us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.95%       4.720us         0.95%       4.720us       1.573us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.95%     154.262us        30.95%     154.262us      51.421us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.07%       5.331us         1.07%       5.331us       5.331us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 522.110us
-Self CUDA time total: 90.940us
+Self CPU time total: 498.498us
+Self CUDA time total: 26.303us
 
 
 
@@ -4427,19 +4427,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S512_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         7.06%     133.042us        99.73%       1.880ms       1.880ms       0.000us         0.00%     249.725us     249.725us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         2.38%      44.822us        92.02%       1.735ms     578.362us     152.446us       100.00%     249.725us      83.242us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     153.726us       100.84%     153.726us     153.726us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     152.446us       100.00%     152.446us      50.815us             3  
-                                Activity Buffer Request        78.99%       1.489ms        78.99%       1.489ms       1.489ms      97.279us        63.81%      97.279us      97.279us             1  
-                                             aten::view         0.65%      12.322us         0.65%      12.322us       2.054us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.73%      32.600us         1.73%      32.600us       3.622us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.25%       4.629us         0.25%       4.629us       1.543us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.68%     163.723us         8.68%     163.723us      54.574us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       5.040us         0.27%       5.040us       5.040us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     126.302us       214.16%     126.302us     126.302us             1  
+                                  hf_kernels_layer_norm         6.77%     126.701us        99.74%       1.866ms       1.866ms       0.000us         0.00%      94.496us      94.496us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.55%      47.732us        92.27%       1.726ms     575.432us      58.976us       100.00%      94.496us      31.499us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      58.976us       100.00%      58.976us      19.659us             3  
+                                Activity Buffer Request        79.36%       1.485ms        79.36%       1.485ms       1.485ms      35.520us        60.23%      35.520us      35.520us             1  
+                                             aten::view         0.70%      13.010us         0.70%      13.010us       2.168us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.62%      30.339us         1.62%      30.339us       3.371us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.26%       4.881us         0.26%       4.881us       1.627us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.48%     158.562us         8.48%     158.562us      52.854us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       4.860us         0.26%       4.860us       4.860us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.885ms
-Self CUDA time total: 152.446us
+Self CPU time total: 1.871ms
+Self CUDA time total: 58.976us
 
 
 
@@ -4449,19 +4449,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S512_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm        11.27%     123.983us        43.73%     481.131us     481.131us       0.000us         0.00%       1.032ms       1.032ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         3.84%      42.226us        31.34%     344.788us     114.929us     778.697us       100.00%       1.032ms     344.128us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     780.107us       100.18%     780.107us     780.107us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     778.697us       100.00%     778.697us     259.566us             3  
-                                Activity Buffer Request         9.29%     102.192us         9.29%     102.192us     102.192us     253.688us        32.58%     253.688us     253.688us             1  
-                                             aten::view         1.12%      12.360us         1.12%      12.360us       2.060us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         2.69%      29.646us         2.69%      29.646us       3.294us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.46%       5.100us         0.46%       5.100us       1.700us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        15.05%     165.624us        15.05%     165.624us      55.208us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        56.27%     619.134us        56.27%     619.134us     619.134us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         7.35%     135.313us        99.73%       1.836ms       1.836ms       0.000us         0.00%     200.830us     200.830us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.50%      46.052us        91.69%       1.688ms     562.585us     126.431us       100.00%     200.830us      66.943us             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     142.015us       112.33%     142.015us     142.015us             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     126.431us       100.00%     126.431us      42.144us             3  
+                                Activity Buffer Request        77.83%       1.433ms        77.83%       1.433ms       1.433ms      74.399us        58.85%      74.399us      74.399us             1  
+                                             aten::view         0.68%      12.599us         0.68%      12.599us       2.100us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.73%      31.929us         1.73%      31.929us       3.548us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.30%       5.440us         0.30%       5.440us       1.813us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.33%     171.692us         9.33%     171.692us      57.231us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.980us         0.27%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.100ms
-Self CUDA time total: 778.697us
+Self CPU time total: 1.841ms
+Self CUDA time total: 126.431us
 
 
 
@@ -4471,19 +4471,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S1024_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm        21.50%     105.382us        99.03%     485.481us     485.481us       0.000us         0.00%     129.403us     129.403us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         8.84%      43.324us        74.99%     367.638us     122.546us      78.013us       100.00%     129.403us      43.134us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     123.195us       157.92%     123.195us     123.195us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      78.013us       100.00%      78.013us      26.004us             3  
-                                Activity Buffer Request        25.08%     122.952us        25.08%     122.952us     122.952us      51.390us        65.87%      51.390us      51.390us             1  
-                                             aten::view         2.54%      12.461us         2.54%      12.461us       2.077us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         6.14%      30.100us         6.14%      30.100us       3.344us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.92%       4.499us         0.92%       4.499us       1.500us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.02%     166.763us        34.02%     166.763us      55.588us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.97%       4.770us         0.97%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     114.877us       559.23%     114.877us     114.877us             1  
+                                  hf_kernels_layer_norm        18.77%     104.472us        99.13%     551.627us     551.627us       0.000us         0.00%      27.357us      27.357us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         8.09%      45.039us        78.27%     435.585us     145.195us      20.542us       100.00%      27.357us       9.119us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      20.542us       100.00%      20.542us       6.847us             3  
+                                Activity Buffer Request        36.72%     204.352us        36.72%     204.352us     204.352us       6.815us        33.18%       6.815us       6.815us             1  
+                                             aten::view         2.08%      11.570us         2.08%      11.570us       1.928us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         5.24%      29.142us         5.24%      29.142us       3.238us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.93%       5.150us         0.93%       5.150us       1.717us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        27.30%     151.902us        27.30%     151.902us      50.634us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.87%       4.869us         0.87%       4.869us       4.869us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 490.251us
-Self CUDA time total: 78.013us
+Self CPU time total: 556.496us
+Self CUDA time total: 20.542us
 
 
 
@@ -4493,19 +4493,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S1024_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm        17.00%     113.402us        99.21%     661.694us     661.694us       0.000us         0.00%     284.025us     284.025us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         6.92%      46.121us        80.21%     534.951us     178.317us     178.523us       100.00%     284.025us      94.675us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     179.835us       100.73%     179.835us     179.835us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     178.523us       100.00%     178.523us      59.508us             3  
-                                Activity Buffer Request        40.82%     272.256us        40.82%     272.256us     272.256us     105.502us        59.10%     105.502us     105.502us             1  
-                                             aten::view         2.00%      13.341us         2.00%      13.341us       2.223us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         6.85%      45.671us         6.85%      45.671us       5.075us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.72%       4.820us         0.72%       4.820us       1.607us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        24.90%     166.083us        24.90%     166.083us      55.361us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.79%       5.260us         0.79%       5.260us       5.260us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     128.543us       194.15%     128.543us     128.543us             1  
+                                  hf_kernels_layer_norm         6.47%     121.263us        99.74%       1.870ms       1.870ms       0.000us         0.00%     103.680us     103.680us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.50%      46.880us        92.61%       1.737ms     578.834us      66.208us       100.00%     103.680us      34.560us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      66.208us       100.00%      66.208us      22.069us             3  
+                                Activity Buffer Request        80.04%       1.501ms        80.04%       1.501ms       1.501ms      37.472us        56.60%      37.472us      37.472us             1  
+                                             aten::view         0.67%      12.550us         0.67%      12.550us       2.092us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.61%      30.111us         1.61%      30.111us       3.346us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.29%       5.429us         0.29%       5.429us       1.810us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.17%     153.262us         8.17%     153.262us      51.087us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       4.790us         0.26%       4.790us       4.790us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 666.954us
-Self CUDA time total: 178.523us
+Self CPU time total: 1.875ms
+Self CUDA time total: 66.208us
 
 
 
@@ -4515,19 +4515,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S1024_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm        10.00%     107.534us        43.74%     470.530us     470.530us       0.000us         0.00%       1.006ms       1.006ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         3.96%      42.640us        32.65%     351.216us     117.072us     763.349us       100.00%       1.006ms     335.355us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     764.661us       100.17%     764.661us     764.661us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     763.349us       100.00%     763.349us     254.450us             3  
-                                Activity Buffer Request        10.47%     112.593us        10.47%     112.593us     112.593us     242.717us        31.80%     242.717us     242.717us             1  
-                                             aten::view         1.10%      11.780us         1.10%      11.780us       1.963us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         2.92%      31.430us         2.92%      31.430us       3.492us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.42%       4.470us         0.42%       4.470us       1.490us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        14.88%     160.083us        14.88%     160.083us      53.361us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        56.26%     605.233us        56.26%     605.233us     605.233us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm        21.88%     101.912us        98.91%     460.726us     460.726us       0.000us         0.00%     193.786us     193.786us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd        10.30%      47.997us        74.62%     347.614us     115.871us     120.124us       100.00%     193.786us      64.595us             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     129.116us       107.49%     129.116us     129.116us             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     120.124us       100.00%     120.124us      40.041us             3  
+                                Activity Buffer Request        23.66%     110.222us        23.66%     110.222us     110.222us      73.662us        61.32%      73.662us      73.662us             1  
+                                             aten::view         2.40%      11.200us         2.40%      11.200us       1.867us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         6.29%      29.283us         6.29%      29.283us       3.254us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         1.07%       4.970us         1.07%       4.970us       1.657us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.30%     155.142us        33.30%     155.142us      51.714us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.09%       5.100us         1.09%       5.100us       5.100us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.076ms
-Self CUDA time total: 763.349us
+Self CPU time total: 465.826us
+Self CUDA time total: 120.124us
 
 
 
@@ -4537,19 +4537,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S1024_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         4.84%     112.452us        36.30%     843.368us     843.368us       0.000us         0.00%       2.131ms       2.131ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         1.85%      43.099us        30.92%     718.406us     239.469us       1.645ms       100.00%       2.131ms     710.322us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.647ms       100.09%       1.647ms       1.647ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       1.645ms       100.00%       1.645ms     548.437us             3  
-                                Activity Buffer Request        20.28%     471.291us        20.28%     471.291us     471.291us     485.654us        29.52%     485.654us     485.654us             1  
-                                             aten::view         0.54%      12.510us         0.54%      12.510us       2.085us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.34%      31.070us         1.34%      31.070us       3.452us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.22%       5.221us         0.22%       5.221us       1.740us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.22%     167.725us         7.22%     167.725us      55.908us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        63.70%       1.480ms        63.70%       1.480ms       1.480ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm        10.47%     108.133us        61.96%     639.990us     639.990us       0.000us         0.00%     741.038us     741.038us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         4.66%      48.171us        50.27%     519.257us     173.086us     556.019us       100.00%     741.038us     247.013us             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     557.395us       100.25%     557.395us     557.395us             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     556.019us       100.00%     556.019us     185.340us             3  
+                                Activity Buffer Request        26.52%     273.914us        26.52%     273.914us     273.914us     185.019us        33.28%     185.019us     185.019us             1  
+                                             aten::view         1.22%      12.600us         1.22%      12.600us       2.100us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         2.91%      30.100us         2.91%      30.100us       3.344us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.47%       4.869us         0.47%       4.869us       1.623us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        15.70%     162.203us        15.70%     162.203us      54.068us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        38.04%     392.946us        38.04%     392.946us     392.946us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.324ms
-Self CUDA time total: 1.645ms
+Self CPU time total: 1.033ms
+Self CUDA time total: 556.019us
 
 
 
@@ -4559,19 +4559,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S2048_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm        22.85%     107.114us        99.00%     464.050us     464.050us       0.000us         0.00%     251.384us     251.384us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         9.45%      44.281us        73.62%     345.116us     115.039us     150.043us       100.00%     251.384us      83.795us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     151.420us       100.92%     151.420us     151.420us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     150.043us       100.00%     150.043us      50.014us             3  
-                                Activity Buffer Request        22.03%     103.262us        22.03%     103.262us     103.262us     101.341us        67.54%     101.341us     101.341us             1  
-                                             aten::view         2.52%      11.820us         2.52%      11.820us       1.970us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         6.37%      29.880us         6.37%      29.880us       3.320us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         1.27%       5.930us         1.27%       5.930us       1.977us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.51%     161.763us        34.51%     161.763us      53.921us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.00%       4.700us         1.00%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     117.533us       202.70%     117.533us     117.533us             1  
+                                  hf_kernels_layer_norm        16.63%     101.441us        99.21%     605.228us     605.228us       0.000us         0.00%      93.950us      93.950us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         7.68%      46.841us        80.72%     492.428us     164.143us      57.983us       100.00%      93.950us      31.317us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      57.983us       100.00%      57.983us      19.328us             3  
+                                Activity Buffer Request        41.81%     255.054us        41.81%     255.054us     255.054us      35.967us        62.03%      35.967us      35.967us             1  
+                                             aten::view         1.86%      11.359us         1.86%      11.359us       1.893us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         4.84%      29.531us         4.84%      29.531us       3.281us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.88%       5.399us         0.88%       5.399us       1.800us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        25.51%     155.603us        25.51%     155.603us      51.868us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.79%       4.850us         0.79%       4.850us       4.850us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 468.750us
-Self CUDA time total: 150.043us
+Self CPU time total: 610.078us
+Self CUDA time total: 57.983us
 
 
 
@@ -4581,19 +4581,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S2048_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         7.34%     110.231us        57.64%     865.428us     865.428us       0.000us         0.00%       1.059ms       1.059ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         2.97%      44.579us        49.44%     742.306us     247.435us     800.455us       100.00%       1.059ms     352.853us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     801.894us       100.18%     801.894us     801.894us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     800.455us       100.00%     800.455us     266.818us             3  
-                                Activity Buffer Request        33.27%     499.511us        33.27%     499.511us     499.511us     258.104us        32.24%     258.104us     258.104us             1  
-                                             aten::view         0.86%      12.891us         0.86%      12.891us       2.148us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         2.04%      30.574us         2.04%      30.574us       3.397us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.36%       5.369us         0.36%       5.369us       1.790us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        10.81%     162.273us        10.81%     162.273us      54.091us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        42.36%     635.954us        42.36%     635.954us     635.954us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm        16.67%     104.061us        99.23%     619.539us     619.539us       0.000us         0.00%     218.617us     218.617us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         7.57%      47.260us        80.66%     503.568us     167.856us     138.780us       100.00%     218.617us      72.872us             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     140.188us       101.01%     140.188us     140.188us             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     138.780us       100.00%     138.780us      46.260us             3  
+                                Activity Buffer Request        42.90%     267.854us        42.90%     267.854us     267.854us      79.837us        57.53%      79.837us      79.837us             1  
+                                             aten::view         1.91%      11.910us         1.91%      11.910us       1.985us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         4.81%      30.001us         4.81%      30.001us       3.333us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.76%       4.720us         0.76%       4.720us       1.573us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        24.62%     153.733us        24.62%     153.733us      51.244us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.77%       4.780us         0.77%       4.780us       4.780us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.501ms
-Self CUDA time total: 800.455us
+Self CPU time total: 624.319us
+Self CUDA time total: 138.780us
 
 
 
@@ -4603,19 +4603,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S2048_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         6.21%     121.211us        24.06%     469.730us     469.730us       0.000us         0.00%       2.133ms       2.133ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         2.16%      42.120us        17.26%     337.067us     112.356us       1.640ms       100.00%       2.133ms     710.841us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.641ms       100.08%       1.641ms       1.641ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       1.640ms       100.00%       1.640ms     546.618us             3  
-                                Activity Buffer Request         4.76%      92.922us         4.76%      92.922us      92.922us     492.667us        30.04%     492.667us     492.667us             1  
-                                             aten::view         0.59%      11.452us         0.59%      11.452us       1.909us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.61%      31.362us         1.61%      31.362us       3.485us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.23%       4.460us         0.23%       4.460us       1.487us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.51%     166.203us         8.51%     166.203us      55.401us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        75.94%       1.483ms        75.94%       1.483ms       1.483ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm        11.56%     103.222us        56.17%     501.697us     501.697us       0.000us         0.00%     729.744us     729.744us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         5.35%      47.791us        43.31%     386.845us     128.948us     547.924us       100.00%     729.744us     243.248us             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     549.427us       100.27%     549.427us     549.427us             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     547.924us       100.00%     547.924us     182.641us             3  
+                                Activity Buffer Request        16.56%     147.902us        16.56%     147.902us     147.902us     181.820us        33.18%     181.820us     181.820us             1  
+                                             aten::view         1.30%      11.630us         1.30%      11.630us       1.938us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         3.31%      29.600us         3.31%      29.600us       3.289us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.55%       4.940us         0.55%       4.940us       1.647us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        17.53%     156.612us        17.53%     156.612us      52.204us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        43.83%     391.555us        43.83%     391.555us     391.555us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.953ms
-Self CUDA time total: 1.640ms
+Self CPU time total: 893.252us
+Self CUDA time total: 547.924us
 
 
 
@@ -4625,19 +4625,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S2048_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         2.73%     111.353us        22.06%     898.879us     898.879us       0.000us         0.00%       4.367ms       4.367ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         1.02%      41.530us        19.00%     774.227us     258.076us       3.342ms       100.00%       4.367ms       1.456ms             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       3.343ms       100.04%       3.343ms       3.343ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       3.342ms       100.00%       3.342ms       1.114ms             3  
-                                Activity Buffer Request        12.92%     526.282us        12.92%     526.282us     526.282us       1.025ms        30.68%       1.025ms       1.025ms             1  
-                                             aten::view         0.33%      13.299us         0.33%      13.299us       2.217us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         0.83%      33.890us         0.83%      33.890us       3.766us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.13%       5.100us         0.13%       5.100us       1.700us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         4.11%     167.425us         4.11%     167.425us      55.808us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        77.94%       3.175ms        77.94%       3.175ms       3.175ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         6.34%     102.532us        36.35%     588.198us     588.198us       0.000us         0.00%       1.536ms       1.536ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.97%      48.143us        29.27%     473.696us     157.899us       1.186ms       100.00%       1.536ms     511.906us             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.188ms       100.13%       1.188ms       1.188ms             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       1.186ms       100.00%       1.186ms     395.396us             3  
+                                Activity Buffer Request        14.38%     232.673us        14.38%     232.673us     232.673us     349.530us        29.47%     349.530us     349.530us             1  
+                                             aten::view         0.74%      11.970us         0.74%      11.970us       1.995us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.86%      30.039us         1.86%      30.039us       3.338us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.30%       4.850us         0.30%       4.850us       1.617us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.76%     157.991us         9.76%     157.991us      52.664us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        63.65%       1.030ms        63.65%       1.030ms       1.030ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.074ms
-Self CUDA time total: 3.342ms
+Self CPU time total: 1.618ms
+Self CUDA time total: 1.186ms
 
 
 
@@ -4647,19 +4647,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     123.232us       471.36%     123.232us     123.232us             1  
-                                  hf_kernels_layer_norm        12.80%     108.565us        99.46%     843.618us     843.618us       0.000us         0.00%      35.008us      35.008us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         5.05%      42.820us        85.21%     722.694us     240.898us      26.144us       100.00%      35.008us      11.669us             3  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      26.144us       100.00%      26.144us       8.715us             3  
-                                Activity Buffer Request        56.64%     480.421us        56.64%     480.421us     480.421us       8.864us        33.90%       8.864us       8.864us             1  
-                                             aten::view         1.46%      12.359us         1.46%      12.359us       2.060us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         3.60%      30.550us         3.60%      30.550us       3.394us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.62%       5.250us         0.62%       5.250us       1.750us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        19.29%     163.653us        19.29%     163.653us      54.551us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.54%       4.550us         0.54%       4.550us       4.550us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     112.413us       848.59%     112.413us     112.413us             1  
+                                  hf_kernels_layer_norm        21.62%     101.733us        99.00%     465.906us     465.906us       0.000us         0.00%      17.726us      17.726us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd        10.03%      47.199us        74.95%     352.704us     117.568us      13.247us       100.00%      17.726us       5.909us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      13.247us       100.00%      13.247us       4.416us             3  
+                                Activity Buffer Request        24.84%     116.882us        24.84%     116.882us     116.882us       4.479us        33.81%       4.479us       4.479us             1  
+                                             aten::view         2.44%      11.469us         2.44%      11.469us       1.912us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         6.31%      29.701us         6.31%      29.701us       3.300us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         1.17%       5.520us         1.17%       5.520us       1.840us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.60%     153.402us        32.60%     153.402us      51.134us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.00%       4.700us         1.00%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 848.168us
-Self CUDA time total: 26.144us
+Self CPU time total: 470.606us
+Self CUDA time total: 13.247us
 
 
 
@@ -4669,19 +4669,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm        20.19%     105.083us        99.15%     516.111us     516.111us       0.000us         0.00%     144.730us     144.730us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         8.34%      43.421us        76.72%     399.369us     133.123us      91.356us       100.00%     144.730us      48.243us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     129.148us       141.37%     129.148us     129.148us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      91.356us       100.00%      91.356us      30.452us             3  
-                                Activity Buffer Request        30.57%     159.154us        30.57%     159.154us     159.154us      53.374us        58.42%      53.374us      53.374us             1  
-                                             aten::view         2.24%      11.659us         2.24%      11.659us       1.943us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         5.91%      30.740us         5.91%      30.740us       3.416us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         1.03%       5.350us         1.03%       5.350us       1.783us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.87%     160.704us        30.87%     160.704us      53.568us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.85%       4.440us         0.85%       4.440us       4.440us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     116.766us       456.71%     116.766us     116.766us             1  
+                                  hf_kernels_layer_norm        17.51%     102.502us        99.17%     580.409us     580.409us       0.000us         0.00%      34.239us      34.239us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         7.99%      46.742us        79.55%     465.587us     155.196us      25.567us       100.00%      34.239us      11.413us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      25.567us       100.00%      25.567us       8.522us             3  
+                                Activity Buffer Request        39.32%     230.104us        39.32%     230.104us     230.104us       8.672us        33.92%       8.672us       8.672us             1  
+                                             aten::view         2.11%      12.320us         2.11%      12.320us       2.053us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         5.04%      29.500us         5.04%      29.500us       3.278us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         1.02%       5.979us         1.02%       5.979us       1.993us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        26.19%     153.262us        26.19%     153.262us      51.087us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.83%       4.860us         0.83%       4.860us       4.860us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 520.551us
-Self CUDA time total: 91.356us
+Self CPU time total: 585.269us
+Self CUDA time total: 25.567us
 
 
 
@@ -4691,19 +4691,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S128_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm        13.09%     110.823us        99.44%     841.628us     841.628us       0.000us         0.00%     249.627us     249.627us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         5.86%      49.630us        84.88%     718.434us     239.478us     153.277us       100.00%     249.627us      83.209us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     154.621us       100.88%     154.621us     154.621us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     153.277us       100.00%     153.277us      51.092us             3  
-                                Activity Buffer Request        55.47%     469.480us        55.47%     469.480us     469.480us      96.350us        62.86%      96.350us      96.350us             1  
-                                             aten::view         1.46%      12.371us         1.46%      12.371us       2.062us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         3.70%      31.290us         3.70%      31.290us       3.477us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.56%       4.751us         0.56%       4.751us       1.584us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        19.29%     163.283us        19.29%     163.283us      54.428us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.56%       4.750us         0.56%       4.750us       4.750us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     120.223us       201.23%     120.223us     120.223us             1  
+                                  hf_kernels_layer_norm        16.35%     102.201us        99.23%     620.398us     620.398us       0.000us         0.00%      95.200us      95.200us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         7.44%      46.527us        81.07%     506.887us     168.962us      59.744us       100.00%      95.200us      31.733us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      59.744us       100.00%      59.744us      19.915us             3  
+                                Activity Buffer Request        43.52%     272.134us        43.52%     272.134us     272.134us      35.456us        59.35%      35.456us      35.456us             1  
+                                             aten::view         1.81%      11.310us         1.81%      11.310us       1.885us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         4.69%      29.332us         4.69%      29.332us       3.259us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.86%       5.391us         0.86%       5.391us       1.797us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        24.55%     153.503us        24.55%     153.503us      51.168us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.77%       4.841us         0.77%       4.841us       4.841us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 846.378us
-Self CUDA time total: 153.277us
+Self CPU time total: 625.239us
+Self CUDA time total: 59.744us
 
 
 
@@ -4713,19 +4713,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S128_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         9.75%     106.662us        43.81%     479.070us     479.070us       0.000us         0.00%       1.022ms       1.022ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         3.93%      43.001us        32.91%     359.867us     119.956us     772.676us       100.00%       1.022ms     340.734us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     773.988us       100.17%     773.988us     773.988us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     772.676us       100.00%     772.676us     257.559us             3  
-                                Activity Buffer Request        10.97%     119.943us        10.97%     119.943us     119.943us     249.527us        32.29%     249.527us     249.527us             1  
-                                             aten::view         1.15%      12.541us         1.15%      12.541us       2.090us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         2.94%      32.110us         2.94%      32.110us       3.568us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.41%       4.440us         0.41%       4.440us       1.480us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        14.66%     160.373us        14.66%     160.373us      53.458us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        56.19%     614.524us        56.19%     614.524us     614.524us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm        20.57%     103.320us        99.00%     497.196us     497.196us       0.000us         0.00%     197.814us     197.814us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         9.11%      45.760us        76.10%     382.195us     127.398us     124.346us       100.00%     197.814us      65.938us             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     132.857us       106.84%     132.857us     132.857us             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     124.346us       100.00%     124.346us      41.449us             3  
+                                Activity Buffer Request        28.52%     143.222us        28.52%     143.222us     143.222us      73.468us        59.08%      73.468us      73.468us             1  
+                                             aten::view         2.33%      11.681us         2.33%      11.681us       1.947us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         6.37%      31.970us         6.37%      31.970us       3.552us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.95%       4.761us         0.95%       4.761us       1.587us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.16%     156.482us        31.16%     156.482us      52.161us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.00%       5.020us         1.00%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.094ms
-Self CUDA time total: 772.676us
+Self CPU time total: 502.216us
+Self CUDA time total: 124.346us
 
 
 
@@ -4735,19 +4735,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm        12.77%     113.905us        99.47%     886.969us     886.969us       0.000us         0.00%     250.621us     250.621us             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         5.13%      45.759us        85.35%     761.025us     253.675us     149.982us       100.00%     250.621us      83.540us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     151.390us       100.94%     151.390us     151.390us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     149.982us       100.00%     149.982us      49.994us             3  
-                                Activity Buffer Request        57.16%     509.711us        57.16%     509.711us     509.711us     100.639us        67.10%     100.639us     100.639us             1  
-                                             aten::view         1.35%      12.039us         1.35%      12.039us       2.007us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         3.32%      29.620us         3.32%      29.620us       3.291us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.60%       5.321us         0.60%       5.321us       1.774us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        19.13%     170.614us        19.13%     170.614us      56.871us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.53%       4.691us         0.53%       4.691us       4.691us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     124.255us       213.59%     124.255us     124.255us             1  
+                                  hf_kernels_layer_norm        13.39%     104.902us        99.38%     778.360us     778.360us       0.000us         0.00%      94.430us      94.430us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         6.10%      47.738us        84.51%     661.878us     220.626us      58.175us       100.00%      94.430us      31.477us             3  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      58.175us       100.00%      58.175us      19.392us             3  
+                                Activity Buffer Request        54.12%     423.885us        54.12%     423.885us     423.885us      36.255us        62.32%      36.255us      36.255us             1  
+                                             aten::view         1.48%      11.580us         1.48%      11.580us       1.930us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         3.89%      30.461us         3.89%      30.461us       3.385us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.64%       5.001us         0.64%       5.001us       1.667us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        19.76%     154.793us        19.76%     154.793us      51.598us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.62%       4.840us         0.62%       4.840us       4.840us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 891.660us
-Self CUDA time total: 149.982us
+Self CPU time total: 783.200us
+Self CUDA time total: 58.175us
 
 
 
@@ -4757,19 +4757,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         8.98%     115.483us        49.89%     641.834us     641.834us       0.000us         0.00%       1.066ms       1.066ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         3.41%      43.812us        39.92%     513.570us     171.190us     804.418us       100.00%       1.066ms     355.382us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     805.858us       100.18%     805.858us     805.858us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     804.418us       100.00%     804.418us     268.139us             3  
-                                Activity Buffer Request        21.15%     272.045us        21.15%     272.045us     272.045us     261.728us        32.54%     261.728us     261.728us             1  
-                                             aten::view         0.99%      12.781us         0.99%      12.781us       2.130us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         2.33%      29.920us         2.33%      29.920us       3.324us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.35%       4.450us         0.35%       4.450us       1.483us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        12.70%     163.343us        12.70%     163.343us      54.448us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        50.11%     644.543us        50.11%     644.543us     644.543us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm        21.79%     100.002us        98.87%     453.846us     453.846us       0.000us         0.00%     220.923us     220.923us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         9.94%      45.651us        74.52%     342.064us     114.021us     139.741us       100.00%     220.923us      73.641us             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     141.149us       101.01%     141.149us     141.149us             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     139.741us       100.00%     139.741us      46.580us             3  
+                                Activity Buffer Request        23.19%     106.461us        23.19%     106.461us     106.461us      81.182us        58.09%      81.182us      81.182us             1  
+                                             aten::view         2.57%      11.780us         2.57%      11.780us       1.963us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         6.95%      31.900us         6.95%      31.900us       3.544us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         1.20%       5.510us         1.20%       5.510us       1.837us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.23%     152.542us        33.23%     152.542us      50.847us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.13%       5.191us         1.13%       5.191us       5.191us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.286ms
-Self CUDA time total: 804.418us
+Self CPU time total: 459.037us
+Self CUDA time total: 139.741us
 
 
 
@@ -4779,19 +4779,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S512_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         5.64%     113.303us        25.29%     508.381us     508.381us       0.000us         0.00%       2.159ms       2.159ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         2.17%      43.641us        19.06%     383.097us     127.699us       1.664ms       100.00%       2.159ms     719.632us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.665ms       100.07%       1.665ms       1.665ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       1.664ms       100.00%       1.664ms     554.697us             3  
-                                Activity Buffer Request         7.17%     144.123us         7.17%     144.123us     144.123us     494.805us        29.73%     494.805us     494.805us             1  
-                                             aten::view         0.60%      11.981us         0.60%      11.981us       1.997us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.55%      31.100us         1.55%      31.100us       3.456us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.23%       4.550us         0.23%       4.550us       1.517us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.94%     159.683us         7.94%     159.683us      53.228us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        74.71%       1.502ms        74.71%       1.502ms       1.502ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         8.50%     106.103us        68.87%     859.212us     859.212us       0.000us         0.00%     730.264us     730.264us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         3.84%      47.858us        59.45%     741.700us     247.233us     547.642us       100.00%     730.264us     243.421us             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     549.114us       100.27%     549.114us     549.114us             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     547.642us       100.00%     547.642us     182.547us             3  
+                                Activity Buffer Request        40.36%     503.557us        40.36%     503.557us     503.557us     182.622us        33.35%     182.622us     182.622us             1  
+                                             aten::view         0.91%      11.409us         0.91%      11.409us       1.901us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         2.41%      30.103us         2.41%      30.103us       3.345us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.44%       5.510us         0.44%       5.510us       1.837us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        12.40%     154.672us        12.40%     154.672us      51.557us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        31.13%     388.435us        31.13%     388.435us     388.435us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.010ms
-Self CUDA time total: 1.664ms
+Self CPU time total: 1.248ms
+Self CUDA time total: 547.642us
 
 
 
@@ -4801,19 +4801,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S512_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         2.98%     113.892us        17.62%     674.305us     674.305us       0.000us         0.00%       4.332ms       4.332ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         1.13%      43.322us        14.33%     548.253us     182.751us       3.318ms       100.00%       4.332ms       1.444ms             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       3.319ms       100.04%       3.319ms       3.319ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       3.318ms       100.00%       3.318ms       1.106ms             3  
-                                Activity Buffer Request         7.93%     303.427us         7.93%     303.427us     303.427us       1.015ms        30.58%       1.015ms       1.015ms             1  
-                                             aten::view         0.32%      12.160us         0.32%      12.160us       2.027us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         0.81%      30.960us         0.81%      30.960us       3.440us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.13%       5.090us         0.13%       5.090us       1.697us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         4.32%     165.454us         4.32%     165.454us      55.151us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        82.38%       3.152ms        82.38%       3.152ms       3.152ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         6.20%     117.401us        45.70%     865.822us     865.822us       0.000us         0.00%       1.533ms       1.533ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.53%      47.909us        38.86%     736.290us     245.430us       1.191ms       100.00%       1.533ms     511.056us             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.192ms       100.13%       1.192ms       1.192ms             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       1.191ms       100.00%       1.191ms     396.977us             3  
+                                Activity Buffer Request        26.13%     495.047us        26.13%     495.047us     495.047us     342.236us        28.74%     342.236us     342.236us             1  
+                                             aten::view         0.64%      12.131us         0.64%      12.131us       2.022us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.61%      30.562us         1.61%      30.562us       3.396us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.31%       5.930us         0.31%       5.930us       1.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.28%     156.842us         8.28%     156.842us      52.281us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        54.30%       1.029ms        54.30%       1.029ms       1.029ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.826ms
-Self CUDA time total: 3.318ms
+Self CPU time total: 1.895ms
+Self CUDA time total: 1.191ms
 
 
 
@@ -4823,19 +4823,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S1024_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         9.56%     107.052us        43.15%     483.460us     483.460us       0.000us         0.00%       1.061ms       1.061ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         3.89%      43.551us        32.51%     364.228us     121.409us     796.221us       100.00%       1.061ms     353.791us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     797.501us       100.16%     797.501us     797.501us             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     796.221us       100.00%     796.221us     265.407us             3  
-                                Activity Buffer Request        10.86%     121.633us        10.86%     121.633us     121.633us     265.151us        33.30%     265.151us     265.151us             1  
-                                             aten::view         1.09%      12.180us         1.09%      12.180us       2.030us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         2.75%      30.759us         2.75%      30.759us       3.418us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.45%       5.070us         0.45%       5.070us       1.690us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        14.57%     163.215us        14.57%     163.215us      54.405us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        56.85%     636.843us        56.85%     636.843us     636.843us       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm        18.76%     102.890us        99.05%     543.128us     543.128us       0.000us         0.00%     191.549us     191.549us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         9.08%      49.784us        78.18%     428.658us     142.886us     117.790us       100.00%     191.549us      63.850us             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     127.934us       108.61%     127.934us     127.934us             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     117.790us       100.00%     117.790us      39.263us             3  
+                                Activity Buffer Request        33.02%     181.032us        33.02%     181.032us     181.032us      73.759us        62.62%      73.759us      73.759us             1  
+                                             aten::view         2.11%      11.580us         2.11%      11.580us       1.930us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         5.47%      30.020us         5.47%      30.020us       3.336us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         1.00%       5.460us         1.00%       5.460us       1.820us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        29.61%     162.362us        29.61%     162.362us      54.121us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.95%       5.190us         0.95%       5.190us       5.190us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.120ms
-Self CUDA time total: 796.221us
+Self CPU time total: 548.318us
+Self CUDA time total: 117.790us
 
 
 
@@ -4845,19 +4845,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S1024_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         5.01%     109.623us        28.99%     634.714us     634.714us       0.000us         0.00%       2.221ms       2.221ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         1.96%      43.002us        23.41%     512.541us     170.847us       1.714ms       100.00%       2.221ms     740.491us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.715ms       100.07%       1.715ms       1.715ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       1.714ms       100.00%       1.714ms     571.320us             3  
-                                Activity Buffer Request        12.44%     272.486us        12.44%     272.486us     272.486us     507.513us        29.61%     507.513us     507.513us             1  
-                                             aten::view         0.57%      12.550us         0.57%      12.550us       2.092us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.46%      31.970us         1.46%      31.970us       3.552us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.21%       4.629us         0.21%       4.629us       1.543us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.33%     160.454us         7.33%     160.454us      53.485us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        71.01%       1.555ms        71.01%       1.555ms       1.555ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         9.87%     125.762us        69.07%     879.903us     879.903us       0.000us         0.00%     766.838us     766.838us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         3.87%      49.332us        58.21%     741.561us     247.187us     575.481us       100.00%     766.838us     255.613us             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     576.857us       100.24%     576.857us     576.857us             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     575.481us       100.00%     575.481us     191.827us             3  
+                                Activity Buffer Request        39.29%     500.518us        39.29%     500.518us     500.518us     191.357us        33.25%     191.357us     191.357us             1  
+                                             aten::view         0.99%      12.580us         0.99%      12.580us       2.097us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         2.41%      30.689us         2.41%      30.689us       3.410us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.43%       5.420us         0.43%       5.420us       1.807us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        12.21%     155.602us        12.21%     155.602us      51.867us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        30.93%     394.045us        30.93%     394.045us     394.045us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.190ms
-Self CUDA time total: 1.714ms
+Self CPU time total: 1.274ms
+Self CUDA time total: 575.481us
 
 
 
@@ -4867,19 +4867,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S1024_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         2.73%     138.274us        37.76%       1.915ms       1.915ms       0.000us         0.00%       4.337ms       4.337ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         0.84%      42.541us        34.77%       1.764ms     588.006us       3.325ms       100.00%       4.337ms       1.446ms             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       3.326ms       100.04%       3.326ms       3.326ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       3.325ms       100.00%       3.325ms       1.108ms             3  
-                                Activity Buffer Request        29.98%       1.521ms        29.98%       1.521ms       1.521ms       1.012ms        30.42%       1.012ms       1.012ms             1  
-                                             aten::view         0.26%      13.190us         0.26%      13.190us       2.198us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         0.63%      32.210us         0.63%      32.210us       3.579us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.10%       4.921us         0.10%       4.921us       1.640us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         3.22%     163.343us         3.22%     163.343us      54.448us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        62.24%       3.157ms        62.24%       3.157ms       3.157ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         6.87%     103.651us        31.62%     476.976us     476.976us       0.000us         0.00%       1.531ms       1.531ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         3.16%      47.619us        23.98%     361.844us     120.615us       1.187ms       100.00%       1.531ms     510.298us             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.188ms       100.13%       1.188ms       1.188ms             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       1.187ms       100.00%       1.187ms     395.515us             3  
+                                Activity Buffer Request         8.20%     123.752us         8.20%     123.752us     123.752us     344.347us        29.02%     344.347us     344.347us             1  
+                                             aten::view         0.76%      11.481us         0.76%      11.481us       1.913us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.98%      29.821us         1.98%      29.821us       3.313us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.39%       5.930us         0.39%       5.930us       1.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        10.26%     154.722us        10.26%     154.722us      51.574us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        68.38%       1.032ms        68.38%       1.032ms       1.032ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.073ms
-Self CUDA time total: 3.325ms
+Self CPU time total: 1.509ms
+Self CUDA time total: 1.187ms
 
 
 
@@ -4889,19 +4889,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S1024_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         1.55%     109.525us         7.88%     556.992us     556.992us       0.000us         0.00%       8.859ms       8.859ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         0.62%      43.791us         6.15%     434.348us     144.783us       6.670ms       100.00%       8.859ms       2.953ms             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       6.672ms       100.02%       6.672ms       6.672ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       6.670ms       100.00%       6.670ms       2.223ms             3  
-                                Activity Buffer Request         2.69%     189.754us         2.69%     189.754us     189.754us       2.188ms        32.81%       2.188ms       2.188ms             1  
-                                             aten::view         0.19%      13.119us         0.19%      13.119us       2.187us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         0.46%      32.450us         0.46%      32.450us       3.606us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.07%       4.630us         0.07%       4.630us       1.543us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.32%     163.723us         2.32%     163.723us      54.574us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        92.12%       6.509ms        92.12%       6.509ms       6.509ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         4.11%     127.961us        28.50%     887.612us     887.612us       0.000us         0.00%       3.104ms       3.104ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         1.47%      45.722us        24.01%     747.701us     249.234us       2.375ms       100.00%       3.104ms       1.035ms             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.376ms       100.06%       2.376ms       2.376ms             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       2.375ms       100.00%       2.375ms     791.601us             3  
+                                Activity Buffer Request        16.22%     505.157us        16.22%     505.157us     505.157us     729.500us        30.72%     729.500us     729.500us             1  
+                                             aten::view         0.38%      11.950us         0.38%      11.950us       1.992us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         0.97%      30.190us         0.97%      30.190us       3.354us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.16%       4.890us         0.16%       4.890us       1.630us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         5.19%     161.742us         5.19%     161.742us      53.914us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        71.50%       2.226ms        71.50%       2.226ms       2.226ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.066ms
-Self CUDA time total: 6.670ms
+Self CPU time total: 3.114ms
+Self CUDA time total: 2.375ms
 
 
 
@@ -4911,19 +4911,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         3.93%     134.994us        55.36%       1.902ms       1.902ms       0.000us         0.00%       2.214ms       2.214ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         1.25%      43.084us        51.05%       1.754ms     584.723us       1.702ms       100.00%       2.214ms     738.134us             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.703ms       100.08%       1.703ms       1.703ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       1.702ms       100.00%       1.702ms     567.353us             3  
-                                Activity Buffer Request        44.05%       1.514ms        44.05%       1.514ms       1.514ms     512.345us        30.10%     512.345us     512.345us             1  
-                                             aten::view         0.39%      13.309us         0.39%      13.309us       2.218us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         0.90%      31.069us         0.90%      31.069us       3.452us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.14%       4.900us         0.14%       4.900us       1.633us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         4.70%     161.503us         4.70%     161.503us      53.834us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        44.64%       1.534ms        44.64%       1.534ms       1.534ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         5.82%     128.863us        81.59%       1.808ms       1.808ms       0.000us         0.00%     756.792us     756.792us             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.11%      46.800us        75.21%       1.666ms     555.488us     566.586us       100.00%     756.792us     252.264us             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     567.994us       100.25%     567.994us     567.994us             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us     566.586us       100.00%     566.586us     188.862us             3  
+                                Activity Buffer Request        64.48%       1.429ms        64.48%       1.429ms       1.429ms     190.206us        33.57%     190.206us     190.206us             1  
+                                             aten::view         0.56%      12.380us         0.56%      12.380us       2.063us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.35%      29.990us         1.35%      29.990us       3.332us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.24%       5.300us         0.24%       5.300us       1.767us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.03%     155.802us         7.03%     155.802us      51.934us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        18.41%     407.946us        18.41%     407.946us     407.946us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.436ms
-Self CUDA time total: 1.702ms
+Self CPU time total: 2.216ms
+Self CUDA time total: 566.586us
 
 
 
@@ -4933,19 +4933,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         2.85%     110.083us        15.45%     596.563us     596.563us       0.000us         0.00%       4.460ms       4.460ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         1.10%      42.541us        12.28%     474.190us     158.063us       3.425ms       100.00%       4.460ms       1.487ms             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       3.426ms       100.04%       3.426ms       3.426ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       3.425ms       100.00%       3.425ms       1.142ms             3  
-                                Activity Buffer Request         5.89%     227.585us         5.89%     227.585us     227.585us       1.035ms        30.23%       1.035ms       1.035ms             1  
-                                             aten::view         0.32%      12.290us         0.32%      12.290us       2.048us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         0.80%      30.950us         0.80%      30.950us       3.439us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.11%       4.410us         0.11%       4.410us       1.470us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         4.37%     168.704us         4.37%     168.704us      56.235us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        84.55%       3.265ms        84.55%       3.265ms       3.265ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         6.78%     107.581us        32.18%     510.957us     510.957us       0.000us         0.00%       1.590ms       1.590ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         2.95%      46.851us        24.67%     391.616us     130.539us       1.234ms       100.00%       1.590ms     529.905us             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.235ms       100.12%       1.235ms       1.235ms             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       1.234ms       100.00%       1.234ms     411.346us             3  
+                                Activity Buffer Request         9.78%     155.342us         9.78%     155.342us     155.342us     355.677us        28.82%     355.677us     355.677us             1  
+                                             aten::view         0.74%      11.760us         0.74%      11.760us       1.960us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.88%      29.861us         1.88%      29.861us       3.318us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.38%       5.960us         0.38%       5.960us       1.987us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.67%     153.602us         9.67%     153.602us      51.201us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        67.82%       1.077ms        67.82%       1.077ms       1.077ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.862ms
-Self CUDA time total: 3.425ms
+Self CPU time total: 1.588ms
+Self CUDA time total: 1.234ms
 
 
 
@@ -4955,19 +4955,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         1.57%     111.872us         8.59%     613.703us     613.703us       0.000us         0.00%       8.913ms       8.913ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         0.61%      43.889us         6.86%     489.921us     163.307us       6.694ms       100.00%       8.913ms       2.971ms             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       6.696ms       100.02%       6.696ms       6.696ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       6.694ms       100.00%       6.694ms       2.231ms             3  
-                                Activity Buffer Request         3.32%     237.355us         3.32%     237.355us     237.355us       2.218ms        33.13%       2.218ms       2.218ms             1  
-                                             aten::view         0.17%      11.910us         0.17%      11.910us       1.985us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         0.44%      31.511us         0.44%      31.511us       3.501us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.07%       5.351us         0.07%       5.351us       1.784us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.41%     171.815us         2.41%     171.815us      57.272us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        91.41%       6.530ms        91.41%       6.530ms       6.530ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         4.29%     122.511us        22.27%     635.379us     635.379us       0.000us         0.00%       3.116ms       3.116ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         1.67%      47.772us        17.54%     500.568us     166.856us       2.375ms       100.00%       3.116ms       1.039ms             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.377ms       100.06%       2.377ms       2.377ms             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       2.375ms       100.00%       2.375ms     791.801us             3  
+                                Activity Buffer Request         8.85%     252.513us         8.85%     252.513us     252.513us     740.986us        31.19%     740.986us     740.986us             1  
+                                             aten::view         0.43%      12.300us         0.43%      12.300us       2.050us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.05%      29.891us         1.05%      29.891us       3.321us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.21%       6.001us         0.21%       6.001us       2.000us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         5.76%     164.391us         5.76%     164.391us      54.797us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        77.73%       2.218ms        77.73%       2.218ms       2.218ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.144ms
-Self CUDA time total: 6.694ms
+Self CPU time total: 2.853ms
+Self CUDA time total: 2.375ms
 
 
 
@@ -4977,129 +4977,81 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         0.82%     112.894us         4.93%     682.565us     682.565us       0.000us         0.00%      17.728ms      17.728ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         0.31%      43.279us         4.02%     556.541us     185.514us      13.321ms       100.00%      17.728ms       5.909ms             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      13.323ms       100.01%      13.323ms      13.323ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us      13.321ms       100.00%      13.321ms       4.440ms             3  
-                                Activity Buffer Request         2.16%     298.477us         2.16%     298.477us     298.477us       4.407ms        33.08%       4.407ms       4.407ms             1  
-                                             aten::view         0.09%      13.130us         0.09%      13.130us       2.188us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         0.24%      33.051us         0.24%      33.051us       3.672us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.04%       5.240us         0.04%       5.240us       1.747us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.28%     176.494us         1.28%     176.494us      58.831us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        95.07%      13.154ms        95.07%      13.154ms      13.154ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         2.07%     109.351us        12.73%     673.809us     673.809us       0.000us         0.00%       6.337ms       6.337ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         0.93%      49.100us        10.45%     553.127us     184.376us       4.781ms       100.00%       6.337ms       2.112ms             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.782ms       100.03%       4.782ms       4.782ms             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       4.781ms       100.00%       4.781ms       1.594ms             3  
+                                Activity Buffer Request         5.38%     284.544us         5.38%     284.544us     284.544us       1.556ms        32.54%       1.556ms       1.556ms             1  
+                                             aten::view         0.21%      11.331us         0.21%      11.331us       1.889us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         0.57%      29.971us         0.57%      29.971us       3.330us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.11%       5.990us         0.11%       5.990us       1.997us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.47%     183.522us         3.47%     183.522us      61.174us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        87.27%       4.620ms        87.27%       4.620ms       4.620ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 13.836ms
-Self CUDA time total: 13.321ms
+Self CPU time total: 5.294ms
+Self CUDA time total: 4.781ms
 
 
 impl                     wl                  p50(ms)  ok
-hf_kernels_layer_norm    LN_B16_S1024_D1024     0.29  False
-hf_kernels_layer_norm    LN_B16_S1024_D2048     0.61  False
-hf_kernels_layer_norm    LN_B16_S1024_D4096     1.15  False
-hf_kernels_layer_norm    LN_B16_S1024_D8192     2.27  False
+hf_kernels_layer_norm    LN_B16_S1024_D1024     0.05  False
+hf_kernels_layer_norm    LN_B16_S1024_D2048     0.22  False
+hf_kernels_layer_norm    LN_B16_S1024_D4096     0.44  False
+hf_kernels_layer_norm    LN_B16_S1024_D8192     0.84  False
 hf_kernels_layer_norm    LN_B16_S128_D1024      0.05  False
 hf_kernels_layer_norm    LN_B16_S128_D2048      0.05  False
-hf_kernels_layer_norm    LN_B16_S128_D4096      0.06  False
-hf_kernels_layer_norm    LN_B16_S128_D8192      0.30  False
-hf_kernels_layer_norm    LN_B16_S2048_D1024     0.61  False
-hf_kernels_layer_norm    LN_B16_S2048_D2048     1.20  False
-hf_kernels_layer_norm    LN_B16_S2048_D4096     2.27  False
-hf_kernels_layer_norm    LN_B16_S2048_D8192     4.51  False
-hf_kernels_layer_norm    LN_B16_S512_D1024      0.06  False
-hf_kernels_layer_norm    LN_B16_S512_D2048      0.30  False
-hf_kernels_layer_norm    LN_B16_S512_D4096      0.59  False
-hf_kernels_layer_norm    LN_B16_S512_D8192      1.16  False
+hf_kernels_layer_norm    LN_B16_S128_D4096      0.05  False
+hf_kernels_layer_norm    LN_B16_S128_D8192      0.05  False
+hf_kernels_layer_norm    LN_B16_S2048_D1024     0.21  False
+hf_kernels_layer_norm    LN_B16_S2048_D2048     0.46  False
+hf_kernels_layer_norm    LN_B16_S2048_D4096     0.84  False
+hf_kernels_layer_norm    LN_B16_S2048_D8192     1.65  False
+hf_kernels_layer_norm    LN_B16_S512_D1024      0.05  False
+hf_kernels_layer_norm    LN_B16_S512_D2048      0.05  False
+hf_kernels_layer_norm    LN_B16_S512_D4096      0.21  False
+hf_kernels_layer_norm    LN_B16_S512_D8192      0.43  False
 hf_kernels_layer_norm    LN_B1_S1024_D1024      0.05  False
 hf_kernels_layer_norm    LN_B1_S1024_D2048      0.05  False
 hf_kernels_layer_norm    LN_B1_S1024_D4096      0.05  False
-hf_kernels_layer_norm    LN_B1_S1024_D8192      0.06  False
-hf_kernels_layer_norm    LN_B1_S128_D1024       0.05  False
+hf_kernels_layer_norm    LN_B1_S1024_D8192      0.05  False
+hf_kernels_layer_norm    LN_B1_S128_D1024       0.04  False
 hf_kernels_layer_norm    LN_B1_S128_D2048       0.05  False
 hf_kernels_layer_norm    LN_B1_S128_D4096       0.05  False
 hf_kernels_layer_norm    LN_B1_S128_D8192       0.05  False
 hf_kernels_layer_norm    LN_B1_S2048_D1024      0.05  False
 hf_kernels_layer_norm    LN_B1_S2048_D2048      0.05  False
-hf_kernels_layer_norm    LN_B1_S2048_D4096      0.06  False
-hf_kernels_layer_norm    LN_B1_S2048_D8192      0.29  False
+hf_kernels_layer_norm    LN_B1_S2048_D4096      0.05  False
+hf_kernels_layer_norm    LN_B1_S2048_D8192      0.05  False
 hf_kernels_layer_norm    LN_B1_S512_D1024       0.05  False
 hf_kernels_layer_norm    LN_B1_S512_D2048       0.05  False
 hf_kernels_layer_norm    LN_B1_S512_D4096       0.05  False
 hf_kernels_layer_norm    LN_B1_S512_D8192       0.05  False
 hf_kernels_layer_norm    LN_B4_S1024_D1024      0.05  False
-hf_kernels_layer_norm    LN_B4_S1024_D2048      0.07  False
-hf_kernels_layer_norm    LN_B4_S1024_D4096      0.29  False
-hf_kernels_layer_norm    LN_B4_S1024_D8192      0.59  False
+hf_kernels_layer_norm    LN_B4_S1024_D2048      0.05  False
+hf_kernels_layer_norm    LN_B4_S1024_D4096      0.05  False
+hf_kernels_layer_norm    LN_B4_S1024_D8192      0.21  False
 hf_kernels_layer_norm    LN_B4_S128_D1024       0.05  False
 hf_kernels_layer_norm    LN_B4_S128_D2048       0.05  False
 hf_kernels_layer_norm    LN_B4_S128_D4096       0.05  False
 hf_kernels_layer_norm    LN_B4_S128_D8192       0.05  False
-hf_kernels_layer_norm    LN_B4_S2048_D1024      0.06  False
-hf_kernels_layer_norm    LN_B4_S2048_D2048      0.30  False
-hf_kernels_layer_norm    LN_B4_S2048_D4096      0.60  False
-hf_kernels_layer_norm    LN_B4_S2048_D8192      1.15  False
+hf_kernels_layer_norm    LN_B4_S2048_D1024      0.05  False
+hf_kernels_layer_norm    LN_B4_S2048_D2048      0.06  False
+hf_kernels_layer_norm    LN_B4_S2048_D4096      0.21  False
+hf_kernels_layer_norm    LN_B4_S2048_D8192      0.44  False
 hf_kernels_layer_norm    LN_B4_S512_D1024       0.05  False
 hf_kernels_layer_norm    LN_B4_S512_D2048       0.05  False
-hf_kernels_layer_norm    LN_B4_S512_D4096       0.06  False
-hf_kernels_layer_norm    LN_B4_S512_D8192       0.29  False
+hf_kernels_layer_norm    LN_B4_S512_D4096       0.05  False
+hf_kernels_layer_norm    LN_B4_S512_D8192       0.05  False
 </pre></div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-   Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading hf-xet (3.2MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading numpy (15.9MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading sympy (6.0MiB)
-Downloading setuptools (1.1MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading networkx (1.9MiB)
-Downloading pillow (6.7MiB)
-Downloading matplotlib (8.3MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading fonttools (4.7MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading triton (148.4MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading torch (846.8MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading hf-xet
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
-      Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading nvidia-cuda-cupti-cu12
- Downloading matplotlib
- Downloading numpy
- Downloading sympy
- Downloading nvidia-nvjitlink-cu12
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
-Installed 47 packages in 233ms
+Installed 15 packages in 14ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 4 files:   0%|          | 0/4 [00:00&lt;?, ?it/s]
-Fetching 4 files:  25%|██▌       | 1/4 [00:00&lt;00:00,  4.84it/s]
-Fetching 4 files:  50%|█████     | 2/4 [00:02&lt;00:02,  1.20s/it]
-Fetching 4 files: 100%|██████████| 4/4 [00:02&lt;00:00,  1.91it/s]</div>
+Fetching 4 files:  25%|██▌       | 1/4 [00:00&lt;00:00,  7.10it/s]
+Fetching 4 files:  50%|█████     | 2/4 [00:01&lt;00:01,  1.13it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00,  2.59it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
diff --git a/layer_norm/impls/torch_layer_norm.html b/layer_norm/impls/torch_layer_norm.html
index 41e261deb76e9889eb66386796e50a970adfd528..6ab9639b8e0f9d249da06445fa4c9ac229f15e09 100644
--- a/layer_norm/impls/torch_layer_norm.html
+++ b/layer_norm/impls/torch_layer_norm.html
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 4.05s
+Cell: nv | 0.22s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,34 +3887,22 @@ Cell: nv | 4.05s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Fri Oct 24 19:23:22 2025       
+<div class="cell-stdout"><pre class="stdout-text">Mon Oct 27 14:46:07 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
-|   0  NVIDIA L4                      Off |   00000000:38:00.0 Off |                    0 |
-| N/A   36C    P0             27W /   72W |       1MiB /  23034MiB |      0%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   1  NVIDIA L4                      Off |   00000000:3A:00.0 Off |                    0 |
-| N/A   33C    P0             28W /   72W |       1MiB /  23034MiB |      2%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   2  NVIDIA L4                      Off |   00000000:3C:00.0 Off |                    0 |
-| N/A   34C    P0             27W /   72W |       1MiB /  23034MiB |      2%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   3  NVIDIA L4                      Off |   00000000:3E:00.0 Off |                    0 |
-| N/A   33C    P0             27W /   72W |       1MiB /  23034MiB |      2%      Default |
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   31C    P0             79W /  350W |       0MiB /  46068MiB |     11%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
 +-----------------------------------------------------------------------------------------+
 | Processes:                                                                              |
-|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
 |        ID   ID                                                               Usage      |
 |=========================================================================================|
 |  No running processes found                                                             |
@@ -3932,7 +3920,7 @@ Cell: nv | 4.05s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 43.38s
+Cell: benchmark | 7.77s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3979,19 +3967,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     112.575us      1075.83%     112.575us     112.575us             1  
-                                       torch_layer_norm         7.69%     144.433us        99.57%       1.871ms       1.871ms       0.000us         0.00%      14.080us      14.080us             1  
-                                       aten::layer_norm         0.88%      16.561us        91.89%       1.727ms     575.622us       0.000us         0.00%      14.080us       4.693us             3  
-                                aten::native_layer_norm         4.64%      87.271us        91.01%       1.710ms     570.102us      10.464us       100.00%      14.080us       4.693us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      10.464us       100.00%      10.464us       3.488us             3  
-                                Activity Buffer Request        80.98%       1.522ms        80.98%       1.522ms       1.522ms       3.616us        34.56%       3.616us       3.616us             1  
-                                            aten::empty         2.62%      49.332us         2.62%      49.332us       5.481us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         2.38%      44.720us         2.38%      44.720us      14.907us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.37%       7.020us         0.37%       7.020us       1.170us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.43%       8.020us         0.43%       8.020us       8.020us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     117.951us      1284.31%     117.951us     117.951us             1  
+                                       torch_layer_norm         8.74%     158.633us        99.57%       1.807ms       1.807ms       0.000us         0.00%      12.352us      12.352us             1  
+                                       aten::layer_norm         0.95%      17.160us        90.83%       1.649ms     549.530us       0.000us         0.00%      12.352us       4.117us             3  
+                                aten::native_layer_norm         4.49%      81.559us        89.88%       1.631ms     543.810us       9.184us       100.00%      12.352us       4.117us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       9.184us       100.00%       9.184us       3.061us             3  
+                                Activity Buffer Request        79.88%       1.450ms        79.88%       1.450ms       1.450ms       3.168us        34.49%       3.168us       3.168us             1  
+                                            aten::empty         2.58%      46.801us         2.58%      46.801us       5.200us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         2.54%      46.162us         2.54%      46.162us      15.387us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.39%       7.072us         0.39%       7.072us       1.179us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.43%       7.860us         0.43%       7.860us       7.860us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.879ms
-Self CUDA time total: 10.464us
+Self CPU time total: 1.815ms
+Self CUDA time total: 9.184us
 
 
 
@@ -4001,19 +3989,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      92.221us       706.40%      92.221us      92.221us             1  
-                                       torch_layer_norm         4.41%      75.663us        99.71%       1.711ms       1.711ms       0.000us         0.00%      17.343us      17.343us             1  
-                                       aten::layer_norm         0.51%       8.781us        95.30%       1.636ms     545.198us       0.000us         0.00%      17.343us       5.781us             3  
-                                aten::native_layer_norm         2.86%      49.142us        94.79%       1.627ms     542.271us      13.055us       100.00%      17.343us       5.781us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      13.055us       100.00%      13.055us       4.352us             3  
-                                Activity Buffer Request        88.33%       1.516ms        88.33%       1.516ms       1.516ms       4.288us        32.85%       4.288us       4.288us             1  
-                                            aten::empty         1.73%      29.720us         1.73%      29.720us       3.302us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         1.63%      27.900us         1.63%      27.900us       9.300us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.24%       4.089us         0.24%       4.089us       0.682us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.29%       5.010us         0.29%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      91.263us       777.10%      91.263us      91.263us             1  
+                                       torch_layer_norm         4.45%      73.631us        99.68%       1.650ms       1.650ms       0.000us         0.00%      15.616us      15.616us             1  
+                                       aten::layer_norm         0.53%       8.730us        95.23%       1.577ms     525.519us       0.000us         0.00%      15.616us       5.205us             3  
+                                aten::native_layer_norm         3.21%      53.200us        94.70%       1.568ms     522.609us      11.744us       100.00%      15.616us       5.205us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      11.744us       100.00%      11.744us       3.915us             3  
+                                Activity Buffer Request        87.81%       1.454ms        87.81%       1.454ms       1.454ms       3.872us        32.97%       3.872us       3.872us             1  
+                                            aten::empty         1.80%      29.853us         1.80%      29.853us       3.317us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         1.64%      27.230us         1.64%      27.230us       9.077us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.23%       3.770us         0.23%       3.770us       0.628us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.32%       5.350us         0.32%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.716ms
-Self CUDA time total: 13.055us
+Self CPU time total: 1.656ms
+Self CUDA time total: 11.744us
 
 
 
@@ -4023,19 +4011,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S128_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      93.568us       488.15%      93.568us      93.568us             1  
-                                       torch_layer_norm         4.18%      71.812us        99.71%       1.711ms       1.711ms       0.000us         0.00%      25.600us      25.600us             1  
-                                       aten::layer_norm         0.51%       8.700us        95.53%       1.639ms     546.498us       0.000us         0.00%      25.600us       8.533us             3  
-                                aten::native_layer_norm         2.93%      50.294us        95.02%       1.631ms     543.598us      19.168us       100.00%      25.600us       8.533us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      19.168us       100.00%      19.168us       6.389us             3  
-                                Activity Buffer Request        88.71%       1.522ms        88.71%       1.522ms       1.522ms       6.432us        33.56%       6.432us       6.432us             1  
-                                            aten::empty         1.61%      27.640us         1.61%      27.640us       3.071us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         1.55%      26.519us         1.55%      26.519us       8.840us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.23%       3.889us         0.23%       3.889us       0.648us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.29%       4.970us         0.29%       4.970us       4.970us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      93.407us       570.11%      93.407us      93.407us             1  
+                                       torch_layer_norm         4.26%      70.071us        99.67%       1.640ms       1.640ms       0.000us         0.00%      21.856us      21.856us             1  
+                                       aten::layer_norm         0.57%       9.440us        95.41%       1.570ms     523.176us       0.000us         0.00%      21.856us       7.285us             3  
+                                aten::native_layer_norm         3.17%      52.082us        94.83%       1.560ms     520.029us      16.384us       100.00%      21.856us       7.285us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      16.384us       100.00%      16.384us       5.461us             3  
+                                Activity Buffer Request        87.95%       1.447ms        87.95%       1.447ms       1.447ms       5.472us        33.40%       5.472us       5.472us             1  
+                                            aten::empty         1.77%      29.121us         1.77%      29.121us       3.236us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         1.71%      28.080us         1.71%      28.080us       9.360us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.24%       4.030us         0.24%       4.030us       0.672us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.33%       5.460us         0.33%       5.460us       5.460us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.716ms
-Self CUDA time total: 19.168us
+Self CPU time total: 1.645ms
+Self CUDA time total: 16.384us
 
 
 
@@ -4045,19 +4033,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S128_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      92.541us       280.78%      92.541us      92.541us             1  
-                                       torch_layer_norm         4.34%      68.272us        99.69%       1.570ms       1.570ms       0.000us         0.00%      43.839us      43.839us             1  
-                                       aten::layer_norm         0.53%       8.411us        95.35%       1.502ms     500.504us       0.000us         0.00%      43.839us      14.613us             3  
-                                aten::native_layer_norm         3.08%      48.533us        94.82%       1.493ms     497.700us      32.959us       100.00%      43.839us      14.613us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      32.959us       100.00%      32.959us      10.986us             3  
-                                Activity Buffer Request        75.11%       1.183ms        75.11%       1.183ms       1.183ms      10.880us        33.01%      10.880us      10.880us             1  
-                                            aten::empty         1.74%      27.430us         1.74%      27.430us       3.048us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        14.65%     230.765us        14.65%     230.765us      76.922us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.24%       3.717us         0.24%       3.717us       0.619us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.31%       4.880us         0.31%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     118.239us       440.39%     118.239us     118.239us             1  
+                                       torch_layer_norm         5.44%      79.142us        99.61%       1.449ms       1.449ms       0.000us         0.00%      35.810us      35.810us             1  
+                                       aten::layer_norm         0.75%      10.900us        94.17%       1.370ms     456.578us       0.000us         0.00%      35.810us      11.937us             3  
+                                aten::native_layer_norm         4.07%      59.211us        93.42%       1.359ms     452.944us      26.849us       100.00%      35.810us      11.937us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      26.849us       100.00%      26.849us       8.950us             3  
+                                Activity Buffer Request        72.70%       1.057ms        72.70%       1.057ms       1.057ms       8.961us        33.38%       8.961us       8.961us             1  
+                                            aten::empty         2.44%      35.559us         2.44%      35.559us       3.951us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        13.86%     201.604us        13.86%     201.604us      67.201us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.34%       4.961us         0.34%       4.961us       0.827us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.39%       5.680us         0.39%       5.680us       5.680us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.575ms
-Self CUDA time total: 32.959us
+Self CPU time total: 1.455ms
+Self CUDA time total: 26.849us
 
 
 
@@ -4067,19 +4055,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      87.967us       645.30%      87.967us      87.967us             1  
-                                       torch_layer_norm         3.73%      70.373us        99.76%       1.880ms       1.880ms       0.000us         0.00%      18.016us      18.016us             1  
-                                       aten::layer_norm         0.45%       8.529us        96.03%       1.809ms     603.153us       0.000us         0.00%      18.016us       6.005us             3  
-                                aten::native_layer_norm         2.56%      48.230us        95.57%       1.801ms     600.310us      13.632us       100.00%      18.016us       6.005us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      13.632us       100.00%      13.632us       4.544us             3  
-                                Activity Buffer Request        79.33%       1.495ms        79.33%       1.495ms       1.495ms       4.384us        32.16%       4.384us       4.384us             1  
-                                            aten::empty         1.49%      27.990us         1.49%      27.990us       3.110us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        12.00%     226.104us        12.00%     226.104us      75.368us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.20%       3.822us         0.20%       3.822us       0.637us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.480us         0.24%       4.480us       4.480us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      95.007us       954.65%      95.007us      95.007us             1  
+                                       torch_layer_norm         4.08%      72.861us        99.69%       1.782ms       1.782ms       0.000us         0.00%      13.216us      13.216us             1  
+                                       aten::layer_norm         0.50%       9.010us        95.61%       1.709ms     569.593us       0.000us         0.00%      13.216us       4.405us             3  
+                                aten::native_layer_norm         3.10%      55.433us        95.11%       1.700ms     566.590us       9.952us       100.00%      13.216us       4.405us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       9.952us       100.00%       9.952us       3.317us             3  
+                                Activity Buffer Request        81.03%       1.448ms        81.03%       1.448ms       1.448ms       3.264us        32.80%       3.264us       3.264us             1  
+                                            aten::empty         1.69%      30.250us         1.69%      30.250us       3.361us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         9.05%     161.792us         9.05%     161.792us      53.931us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.23%       4.100us         0.23%       4.100us       0.683us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.31%       5.520us         0.31%       5.520us       5.520us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.884ms
-Self CUDA time total: 13.632us
+Self CPU time total: 1.787ms
+Self CUDA time total: 9.952us
 
 
 
@@ -4089,19 +4077,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      87.132us       436.38%      87.132us      87.132us             1  
-                                       torch_layer_norm        11.20%      67.652us        99.23%     599.293us     599.293us       0.000us         0.00%      26.430us      26.430us             1  
-                                       aten::layer_norm         1.44%       8.699us        88.03%     531.641us     177.214us       0.000us         0.00%      26.430us       8.810us             3  
-                                aten::native_layer_norm         7.85%      47.430us        86.59%     522.942us     174.314us      19.967us       100.00%      26.430us       8.810us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      19.967us       100.00%      19.967us       6.656us             3  
-                                Activity Buffer Request        37.02%     223.565us        37.02%     223.565us     223.565us       6.463us        32.37%       6.463us       6.463us             1  
-                                            aten::empty         4.44%      26.841us         4.44%      26.841us       2.982us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        36.60%     221.066us        36.60%     221.066us      73.689us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.67%       4.040us         0.67%       4.040us       0.673us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.77%       4.630us         0.77%       4.630us       4.630us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      88.574us       668.68%      88.574us      88.574us             1  
+                                       torch_layer_norm        15.40%      66.901us        98.88%     429.607us     429.607us       0.000us         0.00%      17.629us      17.629us             1  
+                                       aten::layer_norm         2.14%       9.290us        83.48%     362.706us     120.902us       0.000us         0.00%      17.629us       5.876us             3  
+                                aten::native_layer_norm        12.03%      52.280us        81.34%     353.416us     117.805us      13.246us       100.00%      17.629us       5.876us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      13.246us       100.00%      13.246us       4.415us             3  
+                                Activity Buffer Request        26.09%     113.362us        26.09%     113.362us     113.362us       4.383us        33.09%       4.383us       4.383us             1  
+                                            aten::empty         6.80%      29.541us         6.80%      29.541us       3.282us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        35.53%     154.353us        35.53%     154.353us      51.451us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.89%       3.880us         0.89%       3.880us       0.647us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.12%       4.880us         1.12%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 603.923us
-Self CUDA time total: 19.967us
+Self CPU time total: 434.487us
+Self CUDA time total: 13.246us
 
 
 
@@ -4111,19 +4099,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S512_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      98.654us       302.25%      98.654us      98.654us             1  
-                                       torch_layer_norm         3.90%      73.122us        99.74%       1.871ms       1.871ms       0.000us         0.00%      42.848us      42.848us             1  
-                                       aten::layer_norm         0.49%       9.220us        95.85%       1.798ms     599.309us       0.000us         0.00%      42.848us      14.283us             3  
-                                aten::native_layer_norm         2.69%      50.411us        95.35%       1.789ms     596.236us      32.640us       100.00%      42.848us      14.283us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      32.640us       100.00%      32.640us      10.880us             3  
-                                Activity Buffer Request        79.57%       1.493ms        79.57%       1.493ms       1.493ms      10.208us        31.27%      10.208us      10.208us             1  
-                                            aten::empty         1.49%      28.020us         1.49%      28.020us       3.113us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        11.39%     213.675us        11.39%     213.675us      71.225us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.21%       4.030us         0.21%       4.030us       0.672us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       4.820us         0.26%       4.820us       4.820us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      96.609us       488.49%      96.609us      96.609us             1  
+                                       torch_layer_norm         4.03%      71.860us        99.72%       1.776ms       1.776ms       0.000us         0.00%      26.305us      26.305us             1  
+                                       aten::layer_norm         0.54%       9.591us        95.68%       1.704ms     568.087us       0.000us         0.00%      26.305us       8.768us             3  
+                                aten::native_layer_norm         2.97%      52.832us        95.14%       1.695ms     564.890us      19.777us       100.00%      26.305us       8.768us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      19.777us       100.00%      19.777us       6.592us             3  
+                                Activity Buffer Request        81.50%       1.452ms        81.50%       1.452ms       1.452ms       6.528us        33.01%       6.528us       6.528us             1  
+                                            aten::empty         1.62%      28.940us         1.62%      28.940us       3.216us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.82%     157.073us         8.82%     157.073us      52.358us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.23%       4.100us         0.23%       4.100us       0.683us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       5.050us         0.28%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.876ms
-Self CUDA time total: 32.640us
+Self CPU time total: 1.781ms
+Self CUDA time total: 19.777us
 
 
 
@@ -4133,19 +4121,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S512_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.66%      69.232us        99.75%       1.887ms       1.887ms       0.000us         0.00%     140.349us     140.349us             1  
-                                       aten::layer_norm         0.48%       9.050us        96.09%       1.817ms     605.826us       0.000us         0.00%     140.349us      46.783us             3  
-                                aten::native_layer_norm         2.62%      49.510us        95.61%       1.808ms     602.810us      87.870us       100.00%     140.349us      46.783us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     103.646us       117.95%     103.646us     103.646us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      87.870us       100.00%      87.870us      29.290us             3  
-                                Activity Buffer Request        80.23%       1.517ms        80.23%       1.517ms       1.517ms      52.479us        59.72%      52.479us      52.479us             1  
-                                            aten::empty         1.47%      27.721us         1.47%      27.721us       3.080us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        11.09%     209.785us        11.09%     209.785us      69.928us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.21%       3.950us         0.21%       3.950us       0.658us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       4.771us         0.25%       4.771us       4.771us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     101.087us       312.17%     101.087us     101.087us             1  
+                                       torch_layer_norm         4.21%      75.141us        99.72%       1.779ms       1.779ms       0.000us         0.00%      43.134us      43.134us             1  
+                                       aten::layer_norm         0.50%       9.000us        95.50%       1.703ms     567.803us       0.000us         0.00%      43.134us      14.378us             3  
+                                aten::native_layer_norm         3.03%      54.032us        95.00%       1.694ms     564.803us      32.382us       100.00%      43.134us      14.378us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      32.382us       100.00%      32.382us      10.794us             3  
+                                Activity Buffer Request        81.39%       1.452ms        81.39%       1.452ms       1.452ms      10.752us        33.20%      10.752us      10.752us             1  
+                                            aten::empty         1.73%      30.799us         1.73%      30.799us       3.422us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.63%     153.894us         8.63%     153.894us      51.298us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.22%       3.990us         0.22%       3.990us       0.665us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       5.050us         0.28%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.891ms
-Self CUDA time total: 87.870us
+Self CPU time total: 1.784ms
+Self CUDA time total: 32.382us
 
 
 
@@ -4155,19 +4143,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S1024_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      86.653us       383.56%      86.653us      86.653us             1  
-                                       torch_layer_norm        10.96%      67.652us        99.24%     612.643us     612.643us       0.000us         0.00%      29.888us      29.888us             1  
-                                       aten::layer_norm         1.40%       8.670us        88.28%     544.991us     181.664us       0.000us         0.00%      29.888us       9.963us             3  
-                                aten::native_layer_norm         7.55%      46.623us        86.87%     536.321us     178.774us      22.592us       100.00%      29.888us       9.963us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      22.592us       100.00%      22.592us       7.531us             3  
-                                Activity Buffer Request        39.97%     246.735us        39.97%     246.735us     246.735us       7.296us        32.29%       7.296us       7.296us             1  
-                                            aten::empty         4.55%      28.120us         4.55%      28.120us       3.124us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        34.17%     210.983us        34.17%     210.983us      70.328us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.63%       3.860us         0.63%       3.860us       0.643us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.76%       4.720us         0.76%       4.720us       4.720us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      84.605us       738.59%      84.605us      84.605us             1  
+                                       torch_layer_norm        14.65%      66.062us        98.90%     446.008us     446.008us       0.000us         0.00%      15.231us      15.231us             1  
+                                       aten::layer_norm         1.88%       8.459us        84.25%     379.946us     126.649us       0.000us         0.00%      15.231us       5.077us             3  
+                                aten::native_layer_norm        11.07%      49.901us        82.38%     371.487us     123.829us      11.455us       100.00%      15.231us       5.077us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      11.455us       100.00%      11.455us       3.818us             3  
+                                Activity Buffer Request        30.37%     136.933us        30.37%     136.933us     136.933us       3.776us        32.96%       3.776us       3.776us             1  
+                                            aten::empty         6.35%      28.620us         6.35%      28.620us       3.180us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        33.76%     152.233us        33.76%     152.233us      50.744us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.84%       3.800us         0.84%       3.800us       0.633us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.10%       4.941us         1.10%       4.941us       4.941us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 617.363us
-Self CUDA time total: 22.592us
+Self CPU time total: 450.949us
+Self CUDA time total: 11.455us
 
 
 
@@ -4177,19 +4165,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S1024_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     101.408us       299.53%     101.408us     101.408us             1  
-                                       torch_layer_norm         3.59%      69.623us        99.76%       1.933ms       1.933ms       0.000us         0.00%      44.608us      44.608us             1  
-                                       aten::layer_norm         0.46%       8.960us        96.16%       1.864ms     621.253us       0.000us         0.00%      44.608us      14.869us             3  
-                                aten::native_layer_norm         2.58%      49.912us        95.70%       1.855ms     618.266us      33.856us       100.00%      44.608us      14.869us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      33.856us       100.00%      33.856us      11.285us             3  
-                                Activity Buffer Request        80.06%       1.552ms        80.06%       1.552ms       1.552ms      10.752us        31.76%      10.752us      10.752us             1  
-                                            aten::empty         1.48%      28.770us         1.48%      28.770us       3.197us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        11.38%     220.624us        11.38%     220.624us      73.541us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.20%       3.930us         0.20%       3.930us       0.655us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.730us         0.24%       4.730us       4.730us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      95.615us       580.22%      95.615us      95.615us             1  
+                                       torch_layer_norm         3.86%      68.250us        99.72%       1.762ms       1.762ms       0.000us         0.00%      21.951us      21.951us             1  
+                                       aten::layer_norm         0.50%       8.771us        95.86%       1.694ms     564.703us       0.000us         0.00%      21.951us       7.317us             3  
+                                aten::native_layer_norm         3.18%      56.263us        95.36%       1.685ms     561.780us      16.479us       100.00%      21.951us       7.317us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      16.479us       100.00%      16.479us       5.493us             3  
+                                Activity Buffer Request        81.70%       1.444ms        81.70%       1.444ms       1.444ms       5.472us        33.21%       5.472us       5.472us             1  
+                                            aten::empty         1.62%      28.639us         1.62%      28.639us       3.182us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.61%     152.252us         8.61%     152.252us      50.751us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.24%       4.230us         0.24%       4.230us       0.705us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       4.980us         0.28%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.938ms
-Self CUDA time total: 33.856us
+Self CPU time total: 1.767ms
+Self CUDA time total: 16.479us
 
 
 
@@ -4199,19 +4187,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S1024_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm        11.17%      70.062us        99.23%     622.373us     622.373us       0.000us         0.00%     136.799us     136.799us             1  
-                                       aten::layer_norm         1.42%       8.898us        88.06%     552.311us     184.104us       0.000us         0.00%     136.799us      45.600us             3  
-                                aten::native_layer_norm         7.72%      48.411us        86.64%     543.413us     181.138us      86.463us       100.00%     136.799us      45.600us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     104.799us       121.21%     104.799us     104.799us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      86.463us       100.00%      86.463us      28.821us             3  
-                                Activity Buffer Request        40.82%     256.046us        40.82%     256.046us     256.046us      50.336us        58.22%      50.336us      50.336us             1  
-                                            aten::empty         4.50%      28.250us         4.50%      28.250us       3.139us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        32.98%     206.875us        32.98%     206.875us      68.958us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.61%       3.831us         0.61%       3.831us       0.638us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.77%       4.841us         0.77%       4.841us       4.841us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      88.894us       345.94%      88.894us      88.894us             1  
+                                       torch_layer_norm        15.31%      64.511us        98.72%     416.027us     416.027us       0.000us         0.00%      34.240us      34.240us             1  
+                                       aten::layer_norm         2.02%       8.530us        83.41%     351.516us     117.172us       0.000us         0.00%      34.240us      11.413us             3  
+                                aten::native_layer_norm        12.31%      51.881us        81.39%     342.986us     114.329us      25.696us       100.00%      34.240us      11.413us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      25.696us       100.00%      25.696us       8.565us             3  
+                                Activity Buffer Request        25.35%     106.822us        25.35%     106.822us     106.822us       8.544us        33.25%       8.544us       8.544us             1  
+                                            aten::empty         6.69%      28.191us         6.69%      28.191us       3.132us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        36.17%     152.423us        36.17%     152.423us      50.808us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.87%       3.669us         0.87%       3.669us       0.612us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.28%       5.400us         1.28%       5.400us       5.400us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 627.214us
-Self CUDA time total: 86.463us
+Self CPU time total: 421.427us
+Self CUDA time total: 25.696us
 
 
 
@@ -4221,19 +4209,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S1024_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm        10.59%      67.421us        90.93%     578.903us     578.903us       0.000us         0.00%     292.824us     292.824us             1  
-                                       aten::layer_norm         1.32%       8.381us        80.34%     511.482us     170.494us       0.000us         0.00%     292.824us      97.608us             3  
-                                aten::native_layer_norm         7.29%      46.413us        79.02%     503.101us     167.700us     181.403us       100.00%     292.824us      97.608us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     182.844us       100.79%     182.844us     182.844us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     181.403us       100.00%     181.403us      60.468us             3  
-                                Activity Buffer Request        34.89%     222.115us        34.89%     222.115us     222.115us     111.421us        61.42%     111.421us     111.421us             1  
-                                            aten::empty         4.31%      27.430us         4.31%      27.430us       3.048us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        31.85%     202.744us        31.85%     202.744us      67.581us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.69%       4.399us         0.69%       4.399us       0.733us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         9.07%      57.751us         9.07%      57.751us      57.751us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.99%      70.451us        99.68%       1.760ms       1.760ms       0.000us         0.00%     110.273us     110.273us             1  
+                                       aten::layer_norm         0.54%       9.469us        95.69%       1.690ms     563.186us       0.000us         0.00%     110.273us      36.758us             3  
+                                aten::native_layer_norm         2.91%      51.321us        95.15%       1.680ms     560.030us      70.464us       100.00%     110.273us      36.758us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     104.384us       148.14%     104.384us     104.384us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      70.464us       100.00%      70.464us      23.488us             3  
+                                Activity Buffer Request        81.54%       1.440ms        81.54%       1.440ms       1.440ms      39.809us        56.50%      39.809us      39.809us             1  
+                                            aten::empty         1.69%      29.812us         1.69%      29.812us       3.312us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.79%     155.141us         8.79%     155.141us      51.714us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.23%       4.141us         0.23%       4.141us       0.690us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.32%       5.631us         0.32%       5.631us       5.631us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 636.654us
-Self CUDA time total: 181.403us
+Self CPU time total: 1.766ms
+Self CUDA time total: 70.464us
 
 
 
@@ -4243,19 +4231,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S2048_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      94.270us       223.52%      94.270us      94.270us             1  
-                                       torch_layer_norm        10.87%      65.642us        99.23%     599.223us     599.223us       0.000us         0.00%      55.232us      55.232us             1  
-                                       aten::layer_norm         1.37%       8.270us        88.36%     533.581us     177.860us       0.000us         0.00%      55.232us      18.411us             3  
-                                aten::native_layer_norm         8.01%      48.352us        86.99%     525.311us     175.104us      42.176us       100.00%      55.232us      18.411us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      42.176us       100.00%      42.176us      14.059us             3  
-                                Activity Buffer Request        40.23%     242.915us        40.23%     242.915us     242.915us      13.056us        30.96%      13.056us      13.056us             1  
-                                            aten::empty         4.42%      26.710us         4.42%      26.710us       2.968us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        33.66%     203.264us        33.66%     203.264us      67.755us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.67%       4.070us         0.67%       4.070us       0.678us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.77%       4.660us         0.77%       4.660us       4.660us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      94.879us       526.67%      94.879us      94.879us             1  
+                                       torch_layer_norm         3.90%      69.211us        99.68%       1.768ms       1.768ms       0.000us         0.00%      23.935us      23.935us             1  
+                                       aten::layer_norm         0.53%       9.340us        95.78%       1.699ms     566.293us       0.000us         0.00%      23.935us       7.978us             3  
+                                aten::native_layer_norm         2.96%      52.430us        95.26%       1.690ms     563.180us      18.015us       100.00%      23.935us       7.978us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      18.015us       100.00%      18.015us       6.005us             3  
+                                Activity Buffer Request        81.67%       1.449ms        81.67%       1.449ms       1.449ms       5.920us        32.86%       5.920us       5.920us             1  
+                                            aten::empty         1.69%      29.991us         1.69%      29.991us       3.332us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.72%     154.594us         8.72%     154.594us      51.531us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.22%       3.890us         0.22%       3.890us       0.648us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.32%       5.590us         0.32%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 603.883us
-Self CUDA time total: 42.176us
+Self CPU time total: 1.774ms
+Self CUDA time total: 18.015us
 
 
 
@@ -4265,19 +4253,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S2048_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm        11.67%      65.482us        99.14%     556.242us     556.242us       0.000us         0.00%     139.454us     139.454us             1  
-                                       aten::layer_norm         1.45%       8.119us        87.47%     490.760us     163.587us       0.000us         0.00%     139.454us      46.485us             3  
-                                aten::native_layer_norm         8.23%      46.172us        86.02%     482.641us     160.880us      89.983us       100.00%     139.454us      46.485us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      99.327us       110.38%      99.327us      99.327us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      89.983us       100.00%      89.983us      29.994us             3  
-                                Activity Buffer Request        36.14%     202.785us        36.14%     202.785us     202.785us      49.471us        54.98%      49.471us      49.471us             1  
-                                            aten::empty         4.95%      27.770us         4.95%      27.770us       3.086us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        35.90%     201.414us        35.90%     201.414us      67.138us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.80%       4.500us         0.80%       4.500us       0.750us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.86%       4.841us         0.86%       4.841us       4.841us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      92.671us       343.53%      92.671us      92.671us             1  
+                                       torch_layer_norm        14.22%      66.652us        98.98%     463.858us     463.858us       0.000us         0.00%      35.872us      35.872us             1  
+                                       aten::layer_norm         1.92%       9.009us        84.76%     397.206us     132.402us       0.000us         0.00%      35.872us      11.957us             3  
+                                aten::native_layer_norm        11.29%      52.919us        82.83%     388.197us     129.399us      26.976us       100.00%      35.872us      11.957us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      26.976us       100.00%      26.976us       8.992us             3  
+                                Activity Buffer Request        32.20%     150.883us        32.20%     150.883us     150.883us       8.896us        32.98%       8.896us       8.896us             1  
+                                            aten::empty         6.01%      28.182us         6.01%      28.182us       3.131us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        32.49%     152.273us        32.49%     152.273us      50.758us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.84%       3.940us         0.84%       3.940us       0.657us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.02%       4.791us         1.02%       4.791us       4.791us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 561.083us
-Self CUDA time total: 89.983us
+Self CPU time total: 468.649us
+Self CUDA time total: 26.976us
 
 
 
@@ -4287,19 +4275,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S2048_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.41%      68.312us        98.25%       1.970ms       1.970ms       0.000us         0.00%     270.744us     270.744us             1  
-                                       aten::layer_norm         0.47%       9.381us        94.84%       1.901ms     633.747us       0.000us         0.00%     270.744us      90.248us             3  
-                                aten::native_layer_norm         2.60%      52.050us        94.37%       1.892ms     630.620us     169.179us       100.00%     270.744us      90.248us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     170.619us       100.85%     170.619us     170.619us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     169.179us       100.00%     169.179us      56.393us             3  
-                                Activity Buffer Request        80.19%       1.608ms        80.19%       1.608ms       1.608ms     101.565us        60.03%     101.565us     101.565us             1  
-                                            aten::empty         1.37%      27.561us         1.37%      27.561us       3.062us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         9.98%     199.994us         9.98%     199.994us      66.665us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.23%       4.700us         0.23%       4.700us       0.783us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         1.75%      35.140us         1.75%      35.140us      35.140us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     133.341us       184.87%     133.341us     133.341us             1  
+                                       torch_layer_norm         3.93%      69.900us        99.72%       1.772ms       1.772ms       0.000us         0.00%     112.892us     112.892us             1  
+                                       aten::layer_norm         0.55%       9.790us        95.79%       1.702ms     567.350us       0.000us         0.00%     112.892us      37.631us             3  
+                                aten::native_layer_norm         3.28%      58.200us        95.24%       1.692ms     564.087us      72.125us       100.00%     112.892us      37.631us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      72.125us       100.00%      72.125us      24.042us             3  
+                                Activity Buffer Request        80.05%       1.422ms        80.05%       1.422ms       1.422ms      40.767us        56.52%      40.767us      40.767us             1  
+                                            aten::empty         1.64%      29.113us         1.64%      29.113us       3.235us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        10.01%     177.823us        10.01%     177.823us      59.274us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.27%       4.770us         0.27%       4.770us       0.795us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       4.900us         0.28%       4.900us       4.900us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.005ms
-Self CUDA time total: 169.179us
+Self CPU time total: 1.777ms
+Self CUDA time total: 72.125us
 
 
 
@@ -4309,19 +4297,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S2048_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         5.72%      67.681us        46.83%     553.822us     553.822us       0.000us         0.00%       1.004ms       1.004ms             1  
-                                       aten::layer_norm         0.73%       8.590us        41.11%     486.141us     162.047us       0.000us         0.00%       1.004ms     334.666us             3  
-                                aten::native_layer_norm         3.92%      46.321us        40.38%     477.551us     159.184us     752.710us       100.00%       1.004ms     334.666us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     754.214us       100.20%     754.214us     754.214us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     752.710us       100.00%     752.710us     250.903us             3  
-                                Activity Buffer Request        16.98%     200.864us        16.98%     200.864us     200.864us     251.287us        33.38%     251.287us     251.287us             1  
-                                            aten::empty         2.27%      26.822us         2.27%      26.822us       2.980us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        16.81%     198.794us        16.81%     198.794us      66.265us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.40%       4.750us         0.40%       4.750us       0.792us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        53.17%     628.854us        53.17%     628.854us     628.854us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm        14.68%      65.741us        95.47%     427.658us     427.658us       0.000us         0.00%     230.621us     230.621us             1  
+                                       aten::layer_norm         2.04%       9.121us        80.79%     361.917us     120.639us       0.000us         0.00%     230.621us      76.874us             3  
+                                aten::native_layer_norm        11.17%      50.059us        78.75%     352.796us     117.599us     144.510us       100.00%     230.621us      76.874us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     146.014us       101.04%     146.014us     146.014us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     144.510us       100.00%     144.510us      48.170us             3  
+                                Activity Buffer Request        26.04%     116.642us        26.04%     116.642us     116.642us      86.111us        59.59%      86.111us      86.111us             1  
+                                            aten::empty         6.43%      28.811us         6.43%      28.811us       3.201us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        34.20%     153.184us        34.20%     153.184us      51.061us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.92%       4.100us         0.92%       4.100us       0.683us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         4.53%      20.311us         4.53%      20.311us      20.311us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.183ms
-Self CUDA time total: 752.710us
+Self CPU time total: 447.969us
+Self CUDA time total: 144.510us
 
 
 
@@ -4331,19 +4319,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      98.270us       717.51%      98.270us      98.270us             1  
-                                       torch_layer_norm         4.06%      75.621us        99.75%       1.859ms       1.859ms       0.000us         0.00%      18.144us      18.144us             1  
-                                       aten::layer_norm         0.48%       8.869us        95.70%       1.783ms     594.400us       0.000us         0.00%      18.144us       6.048us             3  
-                                aten::native_layer_norm         2.66%      49.555us        95.22%       1.774ms     591.443us      13.696us       100.00%      18.144us       6.048us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      13.696us       100.00%      13.696us       4.565us             3  
-                                Activity Buffer Request        80.42%       1.498ms        80.42%       1.498ms       1.498ms       4.448us        32.48%       4.448us       4.448us             1  
-                                            aten::empty         1.52%      28.408us         1.52%      28.408us       3.156us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        10.37%     193.204us        10.37%     193.204us      64.401us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.25%       4.670us         0.25%       4.670us       0.778us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       4.580us         0.25%       4.580us       4.580us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      92.096us       943.61%      92.096us      92.096us             1  
+                                       torch_layer_norm         3.85%      68.512us        99.73%       1.773ms       1.773ms       0.000us         0.00%      12.864us      12.864us             1  
+                                       aten::layer_norm         0.55%       9.759us        95.87%       1.705ms     568.216us       0.000us         0.00%      12.864us       4.288us             3  
+                                aten::native_layer_norm         3.00%      53.309us        95.32%       1.695ms     564.963us       9.760us       100.00%      12.864us       4.288us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       9.760us       100.00%       9.760us       3.253us             3  
+                                Activity Buffer Request        81.26%       1.445ms        81.26%       1.445ms       1.445ms       3.104us        31.80%       3.104us       3.104us             1  
+                                            aten::empty         1.70%      30.172us         1.70%      30.172us       3.352us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         9.14%     162.452us         9.14%     162.452us      54.151us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.24%       4.201us         0.24%       4.201us       0.700us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       4.880us         0.27%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.863ms
-Self CUDA time total: 13.696us
+Self CPU time total: 1.778ms
+Self CUDA time total: 9.760us
 
 
 
@@ -4353,19 +4341,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      84.157us       423.54%      84.157us      84.157us             1  
-                                       torch_layer_norm        12.56%      65.601us        99.10%     517.451us     517.451us       0.000us         0.00%      26.238us      26.238us             1  
-                                       aten::layer_norm         1.65%       8.620us        86.53%     451.850us     150.617us       0.000us         0.00%      26.238us       8.746us             3  
-                                aten::native_layer_norm         8.95%      46.731us        84.88%     443.230us     147.743us      19.870us       100.00%      26.238us       8.746us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      19.870us       100.00%      19.870us       6.623us             3  
-                                Activity Buffer Request        33.96%     177.304us        33.96%     177.304us     177.304us       6.368us        32.05%       6.368us       6.368us             1  
-                                            aten::empty         5.03%      26.250us         5.03%      26.250us       2.917us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        36.12%     188.585us        36.12%     188.585us      62.862us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.83%       4.360us         0.83%       4.360us       0.727us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.90%       4.720us         0.90%       4.720us       4.720us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      91.521us       709.63%      91.521us      91.521us             1  
+                                       torch_layer_norm         4.32%      76.641us        99.71%       1.771ms       1.771ms       0.000us         0.00%      17.186us      17.186us             1  
+                                       aten::layer_norm         0.52%       9.251us        95.40%       1.694ms     564.620us       0.000us         0.00%      17.186us       5.729us             3  
+                                aten::native_layer_norm         2.94%      52.208us        94.87%       1.685ms     561.536us      12.897us       100.00%      17.186us       5.729us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      12.897us       100.00%      12.897us       4.299us             3  
+                                Activity Buffer Request        81.35%       1.444ms        81.35%       1.444ms       1.444ms       4.289us        33.26%       4.289us       4.289us             1  
+                                            aten::empty         1.65%      29.223us         1.65%      29.223us       3.247us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.72%     154.793us         8.72%     154.793us      51.598us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.22%       3.890us         0.22%       3.890us       0.648us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.29%       5.110us         0.29%       5.110us       5.110us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 522.171us
-Self CUDA time total: 19.870us
+Self CPU time total: 1.776ms
+Self CUDA time total: 12.897us
 
 
 
@@ -4375,19 +4363,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S128_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     100.830us       309.83%     100.830us     100.830us             1  
-                                       torch_layer_norm         3.66%      68.162us        99.75%       1.858ms       1.858ms       0.000us         0.00%      42.752us      42.752us             1  
-                                       aten::layer_norm         0.47%       8.830us        96.09%       1.790ms     596.629us       0.000us         0.00%      42.752us      14.251us             3  
-                                aten::native_layer_norm         2.68%      49.840us        95.61%       1.781ms     593.686us      32.544us       100.00%      42.752us      14.251us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      32.544us       100.00%      32.544us      10.848us             3  
-                                Activity Buffer Request        81.03%       1.509ms        81.03%       1.509ms       1.509ms      10.208us        31.37%      10.208us      10.208us             1  
-                                            aten::empty         1.49%      27.810us         1.49%      27.810us       3.090us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        10.20%     189.944us        10.20%     189.944us      63.315us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.21%       3.981us         0.21%       3.981us       0.663us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       4.711us         0.25%       4.711us       4.711us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      88.130us       448.50%      88.130us      88.130us             1  
+                                       torch_layer_norm        11.06%      64.130us        99.16%     575.190us     575.190us       0.000us         0.00%      26.147us      26.147us             1  
+                                       aten::layer_norm         1.59%       9.222us        88.10%     511.060us     170.353us       0.000us         0.00%      26.147us       8.716us             3  
+                                aten::native_layer_norm         8.61%      49.940us        86.51%     501.838us     167.279us      19.650us       100.00%      26.147us       8.716us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      19.650us       100.00%      19.650us       6.550us             3  
+                                Activity Buffer Request        45.46%     263.724us        45.46%     263.724us     263.724us       6.497us        33.06%       6.497us       6.497us             1  
+                                            aten::empty         4.97%      28.852us         4.97%      28.852us       3.206us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        26.69%     154.833us        26.69%     154.833us      51.611us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.77%       4.489us         0.77%       4.489us       0.748us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.84%       4.880us         0.84%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.863ms
-Self CUDA time total: 32.544us
+Self CPU time total: 580.070us
+Self CUDA time total: 19.650us
 
 
 
@@ -4397,19 +4385,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S128_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.74%      69.652us        99.74%       1.860ms       1.860ms       0.000us         0.00%     141.437us     141.437us             1  
-                                       aten::layer_norm         0.49%       9.081us        96.01%       1.790ms     596.709us       0.000us         0.00%     141.437us      47.146us             3  
-                                aten::native_layer_norm         2.73%      50.892us        95.52%       1.781ms     593.682us      88.286us       100.00%     141.437us      47.146us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     105.086us       119.03%     105.086us     105.086us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      88.286us       100.00%      88.286us      29.429us             3  
-                                Activity Buffer Request        81.06%       1.511ms        81.06%       1.511ms       1.511ms      53.151us        60.20%      53.151us      53.151us             1  
-                                            aten::empty         1.52%      28.430us         1.52%      28.430us       3.159us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        10.00%     186.444us        10.00%     186.444us      62.148us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.21%       3.839us         0.21%       3.839us       0.640us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       4.780us         0.26%       4.780us       4.780us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      92.576us       290.74%      92.576us      92.576us             1  
+                                       torch_layer_norm        10.78%      63.911us        99.14%     587.520us     587.520us       0.000us         0.00%      42.562us      42.562us             1  
+                                       aten::layer_norm         1.44%       8.510us        88.35%     523.609us     174.536us       0.000us         0.00%      42.562us      14.187us             3  
+                                aten::native_layer_norm         8.62%      51.095us        86.92%     515.099us     171.700us      31.841us       100.00%      42.562us      14.187us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      31.841us       100.00%      31.841us      10.614us             3  
+                                Activity Buffer Request        46.87%     277.744us        46.87%     277.744us     277.744us      10.721us        33.67%      10.721us      10.721us             1  
+                                            aten::empty         4.75%      28.169us         4.75%      28.169us       3.130us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        25.92%     153.632us        25.92%     153.632us      51.211us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.75%       4.459us         0.75%       4.459us       0.743us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.86%       5.110us         0.86%       5.110us       5.110us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.865ms
-Self CUDA time total: 88.286us
+Self CPU time total: 592.630us
+Self CUDA time total: 31.841us
 
 
 
@@ -4419,19 +4407,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      93.917us       224.90%      93.917us      93.917us             1  
-                                       torch_layer_norm        11.99%      66.702us        99.09%     551.052us     551.052us       0.000us         0.00%      54.848us      54.848us             1  
-                                       aten::layer_norm         1.50%       8.369us        87.10%     484.350us     161.450us       0.000us         0.00%      54.848us      18.283us             3  
-                                aten::native_layer_norm         8.25%      45.863us        85.59%     475.981us     158.660us      41.760us       100.00%      54.848us      18.283us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      41.760us       100.00%      41.760us      13.920us             3  
-                                Activity Buffer Request        38.06%     211.665us        38.06%     211.665us     211.665us      13.088us        31.34%      13.088us      13.088us             1  
-                                            aten::empty         5.01%      27.870us         5.01%      27.870us       3.097us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        33.56%     186.643us        33.56%     186.643us      62.214us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.71%       3.940us         0.71%       3.940us       0.657us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.91%       5.050us         0.91%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      95.776us       539.28%      95.776us      95.776us             1  
+                                       torch_layer_norm        13.84%     112.583us        99.26%     807.595us     807.595us       0.000us         0.00%      23.680us      23.680us             1  
+                                       aten::layer_norm         1.40%      11.400us        85.42%     695.012us     231.671us       0.000us         0.00%      23.680us       7.893us             3  
+                                aten::native_layer_norm         7.57%      61.601us        84.02%     683.612us     227.871us      17.760us       100.00%      23.680us       7.893us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      17.760us       100.00%      17.760us       5.920us             3  
+                                Activity Buffer Request        33.76%     274.664us        33.76%     274.664us     274.664us       5.920us        33.33%       5.920us       5.920us             1  
+                                            aten::empty         3.69%      30.062us         3.69%      30.062us       3.340us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        38.34%     311.955us        38.34%     311.955us     103.985us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.66%       5.330us         0.66%       5.330us       0.888us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.74%       6.030us         0.74%       6.030us       6.030us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 556.102us
-Self CUDA time total: 41.760us
+Self CPU time total: 813.625us
+Self CUDA time total: 17.760us
 
 
 
@@ -4441,19 +4429,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         4.62%      86.531us        99.75%       1.867ms       1.867ms       0.000us         0.00%     136.638us     136.638us             1  
-                                       aten::layer_norm         0.50%       9.359us        95.12%       1.780ms     593.443us       0.000us         0.00%     136.638us      45.546us             3  
-                                aten::native_layer_norm         2.82%      52.769us        94.62%       1.771ms     590.323us      88.543us       100.00%     136.638us      45.546us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     111.902us       126.38%     111.902us     111.902us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      88.543us       100.00%      88.543us      29.514us             3  
-                                Activity Buffer Request        80.23%       1.502ms        80.23%       1.502ms       1.502ms      48.095us        54.32%      48.095us      48.095us             1  
-                                            aten::empty         1.54%      28.823us         1.54%      28.823us       3.203us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         9.82%     183.715us         9.82%     183.715us      61.238us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.21%       4.020us         0.21%       4.020us       0.670us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       4.770us         0.25%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      96.383us       353.93%      96.383us      96.383us             1  
+                                       torch_layer_norm         4.14%      80.990us        99.72%       1.949ms       1.949ms       0.000us         0.00%      36.288us      36.288us             1  
+                                       aten::layer_norm         0.49%       9.631us        95.58%       1.868ms     622.648us       0.000us         0.00%      36.288us      12.096us             3  
+                                aten::native_layer_norm         2.77%      54.113us        95.09%       1.858ms     619.438us      27.232us       100.00%      36.288us      12.096us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      27.232us       100.00%      27.232us       9.077us             3  
+                                Activity Buffer Request        75.84%       1.482ms        75.84%       1.482ms       1.482ms       9.056us        33.25%       9.056us       9.056us             1  
+                                            aten::empty         1.50%      29.320us         1.50%      29.320us       3.258us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        14.76%     288.535us        14.76%     288.535us      96.178us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.22%       4.249us         0.22%       4.249us       0.708us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       5.411us         0.28%       5.411us       5.411us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.872ms
-Self CUDA time total: 88.543us
+Self CPU time total: 1.954ms
+Self CUDA time total: 27.232us
 
 
 
@@ -4463,19 +4451,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S512_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm        11.20%      64.902us        91.70%     531.442us     531.442us       0.000us         0.00%     274.259us     274.259us             1  
-                                       aten::layer_norm         1.46%       8.459us        80.50%     466.540us     155.513us       0.000us         0.00%     274.259us      91.420us             3  
-                                aten::native_layer_norm         8.11%      47.030us        79.04%     458.081us     152.694us     170.744us       100.00%     274.259us      91.420us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     172.183us       100.84%     172.183us     172.183us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     170.744us       100.00%     170.744us      56.915us             3  
-                                Activity Buffer Request        33.75%     195.605us        33.75%     195.605us     195.605us     103.515us        60.63%     103.515us     103.515us             1  
-                                            aten::empty         4.92%      28.491us         4.92%      28.491us       3.166us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        31.58%     183.015us        31.58%     183.015us      61.005us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.68%       3.940us         0.68%       3.940us       0.657us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         8.30%      48.121us         8.30%      48.121us      48.121us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.80%      69.480us        99.73%       1.822ms       1.822ms       0.000us         0.00%     112.641us     112.641us             1  
+                                       aten::layer_norm         0.50%       9.151us        95.93%       1.752ms     584.111us       0.000us         0.00%     112.641us      37.547us             3  
+                                aten::native_layer_norm         2.81%      51.420us        95.43%       1.743ms     581.060us      72.033us       100.00%     112.641us      37.547us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     101.696us       141.18%     101.696us     101.696us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      72.033us       100.00%      72.033us      24.011us             3  
+                                Activity Buffer Request        80.53%       1.471ms        80.53%       1.471ms       1.471ms      40.608us        56.37%      40.608us      40.608us             1  
+                                            aten::empty         1.60%      29.163us         1.60%      29.163us       3.240us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        10.27%     187.683us        10.27%     187.683us      62.561us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.22%       3.950us         0.22%       3.950us       0.658us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       4.880us         0.27%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 579.563us
-Self CUDA time total: 170.744us
+Self CPU time total: 1.827ms
+Self CUDA time total: 72.033us
 
 
 
@@ -4485,19 +4473,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S512_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         2.78%      68.521us        74.25%       1.833ms       1.833ms       0.000us         0.00%       1.015ms       1.015ms             1  
-                                       aten::layer_norm         0.37%       9.021us        71.48%       1.765ms     588.209us       0.000us         0.00%       1.015ms     338.437us             3  
-                                aten::native_layer_norm         1.97%      48.600us        71.11%       1.756ms     585.202us     765.011us       100.00%       1.015ms     338.437us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     766.355us       100.18%     766.355us     766.355us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     765.011us       100.00%     765.011us     255.004us             3  
-                                Activity Buffer Request        60.48%       1.493ms        60.48%       1.493ms       1.493ms     250.300us        32.72%     250.300us     250.300us             1  
-                                            aten::empty         1.12%      27.530us         1.12%      27.530us       3.059us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         7.39%     182.375us         7.39%     182.375us      60.792us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.16%       4.040us         0.16%       4.040us       0.673us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        25.75%     635.633us        25.75%     635.633us     635.633us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.85%      68.680us        99.71%       1.780ms       1.780ms       0.000us         0.00%     229.955us     229.955us             1  
+                                       aten::layer_norm         0.61%      10.850us        95.86%       1.711ms     570.370us       0.000us         0.00%     229.955us      76.652us             3  
+                                aten::native_layer_norm         3.11%      55.560us        95.26%       1.700ms     566.754us     144.066us       100.00%     229.955us      76.652us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     145.569us       101.04%     145.569us     145.569us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     144.066us       100.00%     144.066us      48.022us             3  
+                                Activity Buffer Request        79.52%       1.419ms        79.52%       1.419ms       1.419ms      85.889us        59.62%      85.889us      85.889us             1  
+                                            aten::empty         1.71%      30.551us         1.71%      30.551us       3.395us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        10.67%     190.375us        10.67%     190.375us      63.458us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.24%       4.330us         0.24%       4.330us       0.722us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.29%       5.130us         0.29%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.469ms
-Self CUDA time total: 765.011us
+Self CPU time total: 1.785ms
+Self CUDA time total: 144.066us
 
 
 
@@ -4507,19 +4495,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S1024_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.81%      69.942us        99.74%       1.831ms       1.831ms       0.000us         0.00%     147.579us     147.579us             1  
-                                       aten::layer_norm         0.48%       8.750us        95.93%       1.761ms     586.892us       0.000us         0.00%     147.579us      49.193us             3  
-                                aten::native_layer_norm         2.73%      50.120us        95.45%       1.752ms     583.976us      98.333us       100.00%     147.579us      49.193us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     109.597us       111.45%     109.597us     109.597us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      98.333us       100.00%      98.333us      32.778us             3  
-                                Activity Buffer Request        81.29%       1.492ms        81.29%       1.492ms       1.492ms      49.246us        50.08%      49.246us      49.246us             1  
-                                            aten::empty         1.50%      27.580us         1.50%      27.580us       3.064us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         9.72%     178.483us         9.72%     178.483us      59.494us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.21%       3.842us         0.21%       3.842us       0.640us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       4.750us         0.26%       4.750us       4.750us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     115.904us       398.90%     115.904us     115.904us             1  
+                                       torch_layer_norm         4.36%      77.971us        99.69%       1.781ms       1.781ms       0.000us         0.00%      38.656us      38.656us             1  
+                                       aten::layer_norm         0.59%      10.570us        95.33%       1.703ms     567.730us       0.000us         0.00%      38.656us      12.885us             3  
+                                aten::native_layer_norm         3.31%      59.081us        94.74%       1.693ms     564.207us      29.056us       100.00%      38.656us      12.885us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      29.056us       100.00%      29.056us       9.685us             3  
+                                Activity Buffer Request        80.03%       1.430ms        80.03%       1.430ms       1.430ms       9.600us        33.04%       9.600us       9.600us             1  
+                                            aten::empty         1.84%      32.962us         1.84%      32.962us       3.662us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         9.29%     165.972us         9.29%     165.972us      55.324us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.27%       4.790us         0.27%       4.790us       0.798us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.31%       5.470us         0.31%       5.470us       5.470us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.835ms
-Self CUDA time total: 98.333us
+Self CPU time total: 1.787ms
+Self CUDA time total: 29.056us
 
 
 
@@ -4529,19 +4517,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S1024_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm        13.41%      85.581us        92.66%     591.443us     591.443us       0.000us         0.00%     270.843us     270.843us             1  
-                                       aten::layer_norm         1.36%       8.710us        79.25%     505.862us     168.621us       0.000us         0.00%     270.843us      90.281us             3  
-                                aten::native_layer_norm         7.27%      46.392us        77.88%     497.152us     165.717us     172.445us       100.00%     270.843us      90.281us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     173.885us       100.84%     173.885us     173.885us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     172.445us       100.00%     172.445us      57.482us             3  
-                                Activity Buffer Request        37.57%     239.825us        37.57%     239.825us     239.825us      98.398us        57.06%      98.398us      98.398us             1  
-                                            aten::empty         4.27%      27.241us         4.27%      27.241us       3.027us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        28.17%     179.824us        28.17%     179.824us      59.941us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.61%       3.870us         0.61%       3.870us       0.645us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         7.34%      46.881us         7.34%      46.881us      46.881us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm        14.07%      64.760us        98.95%     455.588us     455.588us       0.000us         0.00%     101.120us     101.120us             1  
+                                       aten::layer_norm         1.91%       8.791us        84.88%     390.828us     130.276us       0.000us         0.00%     101.120us      33.707us             3  
+                                aten::native_layer_norm        11.79%      54.281us        82.97%     382.037us     127.346us      65.344us       100.00%     101.120us      33.707us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      96.510us       147.70%      96.510us      96.510us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      65.344us       100.00%      65.344us      21.781us             3  
+                                Activity Buffer Request        29.77%     137.072us        29.77%     137.072us     137.072us      35.776us        54.75%      35.776us      35.776us             1  
+                                            aten::empty         6.60%      30.402us         6.60%      30.402us       3.378us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        33.93%     156.232us        33.93%     156.232us      52.077us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.88%       4.050us         0.88%       4.050us       0.675us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.05%       4.840us         1.05%       4.840us       4.840us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 638.324us
-Self CUDA time total: 172.445us
+Self CPU time total: 460.428us
+Self CUDA time total: 65.344us
 
 
 
@@ -4551,19 +4539,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S1024_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         2.80%      69.051us        74.17%       1.829ms       1.829ms       0.000us         0.00%       1.017ms       1.017ms             1  
-                                       aten::layer_norm         0.38%       9.271us        71.37%       1.760ms     586.799us       0.000us         0.00%       1.017ms     338.980us             3  
-                                aten::native_layer_norm         1.96%      48.329us        70.99%       1.751ms     583.709us     774.001us       100.00%       1.017ms     338.980us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     775.313us       100.17%     775.313us     775.313us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     774.001us       100.00%     774.001us     258.000us             3  
-                                Activity Buffer Request        60.50%       1.492ms        60.50%       1.492ms       1.492ms     242.939us        31.39%     242.939us     242.939us             1  
-                                            aten::empty         1.12%      27.712us         1.12%      27.712us       3.079us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         7.26%     179.014us         7.26%     179.014us      59.671us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.16%       3.900us         0.16%       3.900us       0.650us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        25.83%     637.134us        25.83%     637.134us     637.134us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.83%      67.811us        99.72%       1.767ms       1.767ms       0.000us         0.00%     207.840us     207.840us             1  
+                                       aten::layer_norm         0.55%       9.819us        95.89%       1.699ms     566.320us       0.000us         0.00%     207.840us      69.280us             3  
+                                aten::native_layer_norm         3.03%      53.603us        95.34%       1.689ms     563.047us     129.312us       100.00%     207.840us      69.280us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     130.911us       101.24%     130.911us     130.911us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     129.312us       100.00%     129.312us      43.104us             3  
+                                Activity Buffer Request        81.49%       1.444ms        81.49%       1.444ms       1.444ms      78.528us        60.73%      78.528us      78.528us             1  
+                                            aten::empty         1.74%      30.830us         1.74%      30.830us       3.426us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.86%     156.973us         8.86%     156.973us      52.324us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.23%       4.020us         0.23%       4.020us       0.670us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       4.980us         0.28%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.467ms
-Self CUDA time total: 774.001us
+Self CPU time total: 1.772ms
+Self CUDA time total: 129.312us
 
 
 
@@ -4573,19 +4561,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S1024_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.34%      67.143us        27.08%     544.452us     544.452us       0.000us         0.00%       2.061ms       2.061ms             1  
-                                       aten::layer_norm         0.43%       8.689us        23.74%     477.309us     159.103us       0.000us         0.00%       2.061ms     687.112us             3  
-                                aten::native_layer_norm         2.32%      46.570us        23.31%     468.620us     156.207us       1.591ms       100.00%       2.061ms     687.112us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.593ms       100.09%       1.593ms       1.593ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.591ms       100.00%       1.591ms     530.454us             3  
-                                Activity Buffer Request        10.53%     211.705us        10.53%     211.705us     211.705us     469.975us        29.53%     469.975us     469.975us             1  
-                                            aten::empty         1.38%      27.780us         1.38%      27.780us       3.087us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         8.88%     178.623us         8.88%     178.623us      59.541us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.20%       3.942us         0.20%       3.942us       0.657us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        72.92%       1.466ms        72.92%       1.466ms       1.466ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.13%      68.611us        81.17%       1.779ms       1.779ms       0.000us         0.00%     737.526us     737.526us             1  
+                                       aten::layer_norm         0.41%       9.061us        78.04%       1.711ms     570.260us       0.000us         0.00%     737.526us     245.842us             3  
+                                aten::native_layer_norm         2.43%      53.328us        77.62%       1.702ms     567.240us     547.705us       100.00%     737.526us     245.842us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     549.241us       100.28%     549.241us     549.241us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     547.705us       100.00%     547.705us     182.568us             3  
+                                Activity Buffer Request        66.39%       1.455ms        66.39%       1.455ms       1.455ms     189.821us        34.66%     189.821us     189.821us             1  
+                                            aten::empty         1.36%      29.741us         1.36%      29.741us       3.305us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         7.27%     159.364us         7.27%     159.364us      53.121us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.18%       3.911us         0.18%       3.911us       0.652us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        18.83%     412.857us        18.83%     412.857us     412.857us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.011ms
-Self CUDA time total: 1.591ms
+Self CPU time total: 2.192ms
+Self CUDA time total: 547.705us
 
 
 
@@ -4595,19 +4583,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S2048_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.62%      68.292us        96.50%       1.823ms       1.823ms       0.000us         0.00%     293.305us     293.305us             1  
-                                       aten::layer_norm         0.46%       8.692us        92.88%       1.754ms     584.823us       0.000us         0.00%     293.305us      97.768us             3  
-                                aten::native_layer_norm         2.51%      47.441us        92.42%       1.746ms     581.925us     194.459us       100.00%     293.305us      97.768us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     195.932us       100.76%     195.932us     195.932us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     194.459us       100.00%     194.459us      64.820us             3  
-                                Activity Buffer Request        78.85%       1.489ms        78.85%       1.489ms       1.489ms      98.846us        50.83%      98.846us      98.846us             1  
-                                            aten::empty         1.54%      29.039us         1.54%      29.039us       3.227us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         9.30%     175.764us         9.30%     175.764us      58.588us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.21%       4.050us         0.21%       4.050us       0.675us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         3.50%      66.181us         3.50%      66.181us      66.181us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm        13.81%      64.951us        98.91%     465.198us     465.198us       0.000us         0.00%     102.813us     102.813us             1  
+                                       aten::layer_norm         2.00%       9.429us        85.10%     400.247us     133.416us       0.000us         0.00%     102.813us      34.271us             3  
+                                aten::native_layer_norm        10.88%      51.150us        83.10%     390.818us     130.273us      68.606us       100.00%     102.813us      34.271us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     100.893us       147.06%     100.893us     100.893us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      68.606us       100.00%      68.606us      22.869us             3  
+                                Activity Buffer Request        31.07%     146.142us        31.07%     146.142us     146.142us      34.207us        49.86%      34.207us      34.207us             1  
+                                            aten::empty         6.17%      29.002us         6.17%      29.002us       3.222us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        34.16%     160.644us        34.16%     160.644us      53.548us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.82%       3.880us         0.82%       3.880us       0.647us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.09%       5.121us         1.09%       5.121us       5.121us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.889ms
-Self CUDA time total: 194.459us
+Self CPU time total: 470.319us
+Self CUDA time total: 68.606us
 
 
 
@@ -4617,19 +4605,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S2048_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         2.78%      69.740us        74.10%       1.857ms       1.857ms       0.000us         0.00%       1.019ms       1.019ms             1  
-                                       aten::layer_norm         0.37%       9.390us        71.32%       1.787ms     595.616us       0.000us         0.00%       1.019ms     339.749us             3  
-                                aten::native_layer_norm         1.97%      49.270us        70.94%       1.777ms     592.486us     782.484us       100.00%       1.019ms     339.749us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     783.796us       100.17%     783.796us     783.796us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     782.484us       100.00%     782.484us     260.828us             3  
-                                Activity Buffer Request        60.61%       1.519ms        60.61%       1.519ms       1.519ms     236.764us        30.26%     236.764us     236.764us             1  
-                                            aten::empty         1.20%      30.103us         1.20%      30.103us       3.345us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         7.01%     175.614us         7.01%     175.614us      58.538us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.16%       3.909us         0.16%       3.909us       0.651us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        25.90%     648.943us        25.90%     648.943us     648.943us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.85%      67.820us        99.72%       1.755ms       1.755ms       0.000us         0.00%     204.288us     204.288us             1  
+                                       aten::layer_norm         0.52%       9.151us        95.86%       1.687ms     562.280us       0.000us         0.00%     204.288us      68.096us             3  
+                                aten::native_layer_norm         2.95%      51.910us        95.34%       1.678ms     559.230us     129.120us       100.00%     204.288us      68.096us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     130.560us       101.12%     130.560us     130.560us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     129.120us       100.00%     129.120us      43.040us             3  
+                                Activity Buffer Request        81.69%       1.437ms        81.69%       1.437ms       1.437ms      75.168us        58.22%      75.168us      75.168us             1  
+                                            aten::empty         1.73%      30.362us         1.73%      30.362us       3.374us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.76%     154.112us         8.76%     154.112us      51.371us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.22%       3.910us         0.22%       3.910us       0.652us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       4.960us         0.28%       4.960us       4.960us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.506ms
-Self CUDA time total: 782.484us
+Self CPU time total: 1.760ms
+Self CUDA time total: 129.120us
 
 
 
@@ -4639,19 +4627,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S2048_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.31%      67.692us        28.17%     576.572us     576.572us       0.000us         0.00%       2.073ms       2.073ms             1  
-                                       aten::layer_norm         0.43%       8.840us        24.86%     508.880us     169.627us       0.000us         0.00%       2.073ms     691.102us             3  
-                                aten::native_layer_norm         2.41%      49.301us        24.43%     500.040us     166.680us       1.601ms       100.00%       2.073ms     691.102us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.602ms       100.08%       1.602ms       1.602ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.601ms       100.00%       1.601ms     533.655us             3  
-                                Activity Buffer Request        10.93%     223.675us        10.93%     223.675us     223.675us     472.343us        29.50%     472.343us     472.343us             1  
-                                            aten::empty         1.43%      29.180us         1.43%      29.180us       3.242us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         9.47%     193.884us         9.47%     193.884us      64.628us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.20%       4.000us         0.20%       4.000us       0.667us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        71.83%       1.470ms        71.83%       1.470ms       1.470ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.24%      70.231us        80.97%       1.754ms       1.754ms       0.000us         0.00%     714.792us     714.792us             1  
+                                       aten::layer_norm         0.42%       9.200us        77.73%       1.684ms     561.233us       0.000us         0.00%     714.792us     238.264us             3  
+                                aten::native_layer_norm         2.38%      51.610us        77.31%       1.674ms     558.166us     542.598us       100.00%     714.792us     238.264us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     544.071us       100.27%     544.071us     544.071us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     542.598us       100.00%     542.598us     180.866us             3  
+                                Activity Buffer Request        66.26%       1.435ms        66.26%       1.435ms       1.435ms     172.194us        31.74%     172.194us     172.194us             1  
+                                            aten::empty         1.34%      28.942us         1.34%      28.942us       3.216us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         7.14%     154.623us         7.14%     154.623us      51.541us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.19%       4.030us         0.19%       4.030us       0.672us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        19.03%     412.116us        19.03%     412.116us     412.116us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.047ms
-Self CUDA time total: 1.601ms
+Self CPU time total: 2.166ms
+Self CUDA time total: 542.598us
 
 
 
@@ -4661,19 +4649,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S2048_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         1.41%      71.321us        36.81%       1.859ms       1.859ms       0.000us         0.00%       4.346ms       4.346ms             1  
-                                       aten::layer_norm         0.18%       9.191us        35.39%       1.788ms     595.990us       0.000us         0.00%       4.346ms       1.449ms             3  
-                                aten::native_layer_norm         0.98%      49.420us        35.21%       1.779ms     592.926us       3.326ms       100.00%       4.346ms       1.449ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       3.327ms       100.04%       3.327ms       3.327ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       3.326ms       100.00%       3.326ms       1.109ms             3  
-                                Activity Buffer Request        30.03%       1.517ms        30.03%       1.517ms       1.517ms       1.021ms        30.69%       1.021ms       1.021ms             1  
-                                            aten::empty         0.58%      29.141us         0.58%      29.141us       3.238us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         3.55%     179.254us         3.55%     179.254us      59.751us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.08%       3.870us         0.08%       3.870us       0.645us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        63.19%       3.192ms        63.19%       3.192ms       3.192ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         2.50%      69.210us        63.28%       1.753ms       1.753ms       0.000us         0.00%       1.482ms       1.482ms             1  
+                                       aten::layer_norm         0.34%       9.550us        60.78%       1.684ms     561.333us       0.000us         0.00%       1.482ms     494.135us             3  
+                                aten::native_layer_norm         1.89%      52.442us        60.43%       1.674ms     558.150us       1.150ms       100.00%       1.482ms     494.135us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.151ms       100.12%       1.151ms       1.151ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.150ms       100.00%       1.150ms     383.212us             3  
+                                Activity Buffer Request        51.68%       1.432ms        51.68%       1.432ms       1.432ms     332.769us        28.95%     332.769us     332.769us             1  
+                                            aten::empty         1.10%      30.460us         1.10%      30.460us       3.384us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         5.62%     155.772us         5.62%     155.772us      51.924us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.14%       3.891us         0.14%       3.891us       0.649us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        36.72%       1.018ms        36.72%       1.018ms       1.018ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.051ms
-Self CUDA time total: 3.326ms
+Self CPU time total: 2.771ms
+Self CUDA time total: 1.150ms
 
 
 
@@ -4683,19 +4671,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      94.845us       222.19%      94.845us      94.845us             1  
-                                       torch_layer_norm        12.19%      64.781us        99.08%     526.632us     526.632us       0.000us         0.00%      56.095us      56.095us             1  
-                                       aten::layer_norm         1.55%       8.240us        86.89%     461.851us     153.950us       0.000us         0.00%      56.095us      18.698us             3  
-                                aten::native_layer_norm         8.82%      46.862us        85.34%     453.611us     151.204us      42.687us       100.00%      56.095us      18.698us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      42.687us       100.00%      42.687us      14.229us             3  
-                                Activity Buffer Request        37.58%     199.725us        37.58%     199.725us     199.725us      13.408us        31.41%      13.408us      13.408us             1  
-                                            aten::empty         5.07%      26.941us         5.07%      26.941us       2.993us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        33.00%     175.413us        33.00%     175.413us      58.471us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.88%       4.670us         0.88%       4.670us       0.778us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.92%       4.890us         0.92%       4.890us       4.890us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      86.813us       481.04%      86.813us      86.813us             1  
+                                       torch_layer_norm        13.94%      63.610us        98.78%     450.788us     450.788us       0.000us         0.00%      23.966us      23.966us             1  
+                                       aten::layer_norm         1.92%       8.751us        84.84%     387.178us     129.059us       0.000us         0.00%      23.966us       7.989us             3  
+                                aten::native_layer_norm        11.33%      51.701us        82.93%     378.427us     126.142us      18.047us       100.00%      23.966us       7.989us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      18.047us       100.00%      18.047us       6.016us             3  
+                                Activity Buffer Request        30.87%     140.892us        30.87%     140.892us     140.892us       5.919us        32.80%       5.919us       5.919us             1  
+                                            aten::empty         6.07%      27.691us         6.07%      27.691us       3.077us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        33.75%     154.013us        33.75%     154.013us      51.338us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.91%       4.130us         0.91%       4.130us       0.688us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.22%       5.560us         1.22%       5.560us       5.560us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 531.522us
-Self CUDA time total: 42.687us
+Self CPU time total: 456.348us
+Self CUDA time total: 18.047us
 
 
 
@@ -4705,19 +4693,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm        10.93%      66.051us        99.19%     599.583us     599.583us       0.000us         0.00%     137.212us     137.212us             1  
-                                       aten::layer_norm         1.47%       8.912us        88.27%     533.532us     177.844us       0.000us         0.00%     137.212us      45.737us             3  
-                                aten::native_layer_norm         8.28%      50.060us        86.79%     524.620us     174.873us      88.510us       100.00%     137.212us      45.737us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     106.109us       119.88%     106.109us     106.109us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      88.510us       100.00%      88.510us      29.503us             3  
-                                Activity Buffer Request        42.84%     258.935us        42.84%     258.935us     258.935us      48.702us        55.02%      48.702us      48.702us             1  
-                                            aten::empty         4.66%      28.180us         4.66%      28.180us       3.131us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        30.27%     182.954us        30.27%     182.954us      60.985us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.74%       4.491us         0.74%       4.491us       0.748us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.81%       4.880us         0.81%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      94.272us       347.01%      94.272us      94.272us             1  
+                                       torch_layer_norm         3.87%      67.581us        99.70%       1.743ms       1.743ms       0.000us         0.00%      36.063us      36.063us             1  
+                                       aten::layer_norm         0.54%       9.410us        95.84%       1.675ms     558.423us       0.000us         0.00%      36.063us      12.021us             3  
+                                aten::native_layer_norm         3.00%      52.431us        95.30%       1.666ms     555.286us      27.167us       100.00%      36.063us      12.021us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      27.167us       100.00%      27.167us       9.056us             3  
+                                Activity Buffer Request        81.64%       1.427ms        81.64%       1.427ms       1.427ms       8.896us        32.75%       8.896us       8.896us             1  
+                                            aten::empty         1.64%      28.640us         1.64%      28.640us       3.182us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.79%     153.563us         8.79%     153.563us      51.188us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.23%       4.090us         0.23%       4.090us       0.682us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.30%       5.160us         0.30%       5.160us       5.160us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 604.463us
-Self CUDA time total: 88.510us
+Self CPU time total: 1.748ms
+Self CUDA time total: 27.167us
 
 
 
@@ -4727,19 +4715,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S128_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm        10.02%      64.262us        91.98%     590.053us     590.053us       0.000us         0.00%     278.967us     278.967us             1  
-                                       aten::layer_norm         1.23%       7.910us        81.96%     525.791us     175.264us       0.000us         0.00%     278.967us      92.989us             3  
-                                aten::native_layer_norm         7.34%      47.060us        80.73%     517.881us     172.627us     173.722us       100.00%     278.967us      92.989us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     175.066us       100.77%     175.066us     175.066us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     173.722us       100.00%     173.722us      57.907us             3  
-                                Activity Buffer Request        41.71%     267.606us        41.71%     267.606us     267.606us     105.245us        60.58%     105.245us     105.245us             1  
-                                            aten::empty         4.10%      26.330us         4.10%      26.330us       2.926us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        26.95%     172.865us        26.95%     172.865us      57.622us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.63%       4.020us         0.63%       4.020us       0.670us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         8.02%      51.462us         8.02%      51.462us      51.462us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm        15.30%      64.290us        98.85%     415.327us     415.327us       0.000us         0.00%     113.182us     113.182us             1  
+                                       aten::layer_norm         1.89%       7.931us        83.55%     351.037us     117.012us       0.000us         0.00%     113.182us      37.727us             3  
+                                aten::native_layer_norm        12.15%      51.059us        81.66%     343.106us     114.369us      72.639us       100.00%     113.182us      37.727us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      97.758us       134.58%      97.758us      97.758us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      72.639us       100.00%      72.639us      24.213us             3  
+                                Activity Buffer Request        25.15%     105.652us        25.15%     105.652us     105.652us      40.543us        55.81%      40.543us      40.543us             1  
+                                            aten::empty         7.08%      29.763us         7.08%      29.763us       3.307us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        36.37%     152.792us        36.37%     152.792us      50.931us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.91%       3.840us         0.91%       3.840us       0.640us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.15%       4.831us         1.15%       4.831us       4.831us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 641.515us
-Self CUDA time total: 173.722us
+Self CPU time total: 420.158us
+Self CUDA time total: 72.639us
 
 
 
@@ -4749,19 +4737,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S128_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         5.42%      64.123us        46.63%     551.262us     551.262us       0.000us         0.00%     999.011us     999.011us             1  
-                                       aten::layer_norm         0.70%       8.291us        41.21%     487.139us     162.380us       0.000us         0.00%     999.011us     333.004us             3  
-                                aten::native_layer_norm         4.04%      47.749us        40.50%     478.848us     159.616us     754.698us       100.00%     999.011us     333.004us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     755.978us       100.17%     755.978us     755.978us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     754.698us       100.00%     754.698us     251.566us             3  
-                                Activity Buffer Request        19.30%     228.145us        19.30%     228.145us     228.145us     244.313us        32.37%     244.313us     244.313us             1  
-                                            aten::empty         2.39%      28.290us         2.39%      28.290us       3.143us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        14.40%     170.293us        14.40%     170.293us      56.764us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.37%       4.371us         0.37%       4.371us       0.729us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        53.37%     630.953us        53.37%     630.953us     630.953us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.89%      68.361us        99.32%       1.748ms       1.748ms       0.000us         0.00%     226.432us     226.432us             1  
+                                       aten::layer_norm         0.51%       8.970us        95.44%       1.679ms     559.750us       0.000us         0.00%     226.432us      75.477us             3  
+                                aten::native_layer_norm         3.03%      53.343us        94.93%       1.670ms     556.760us     142.207us       100.00%     226.432us      75.477us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     143.552us       100.95%     143.552us     143.552us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     142.207us       100.00%     142.207us      47.402us             3  
+                                Activity Buffer Request        81.27%       1.430ms        81.27%       1.430ms       1.430ms      84.225us        59.23%      84.225us      84.225us             1  
+                                            aten::empty         1.69%      29.760us         1.69%      29.760us       3.307us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.71%     153.172us         8.71%     153.172us      51.057us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.23%       4.080us         0.23%       4.080us       0.680us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.68%      11.911us         0.68%      11.911us      11.911us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.182ms
-Self CUDA time total: 754.698us
+Self CPU time total: 1.760ms
+Self CUDA time total: 142.207us
 
 
 
@@ -4771,19 +4759,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm        10.01%      65.931us        89.70%     590.752us     590.752us       0.000us         0.00%     288.918us     288.918us             1  
-                                       aten::layer_norm         1.34%       8.832us        79.69%     524.821us     174.940us       0.000us         0.00%     288.918us      96.306us             3  
-                                aten::native_layer_norm         7.23%      47.600us        78.35%     515.989us     171.996us     192.505us       100.00%     288.918us      96.306us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     193.977us       100.76%     193.977us     193.977us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     192.505us       100.00%     192.505us      64.168us             3  
-                                Activity Buffer Request        41.13%     270.896us        41.13%     270.896us     270.896us      96.413us        50.08%      96.413us      96.413us             1  
-                                            aten::empty         4.24%      27.950us         4.24%      27.950us       3.106us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        25.15%     165.623us        25.15%     165.623us      55.208us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.60%       3.920us         0.60%       3.920us       0.653us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        10.30%      67.841us        10.30%      67.841us      67.841us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.86%      67.581us        99.71%       1.745ms       1.745ms       0.000us         0.00%     103.967us     103.967us             1  
+                                       aten::layer_norm         0.51%       8.910us        95.84%       1.677ms     559.073us       0.000us         0.00%     103.967us      34.656us             3  
+                                aten::native_layer_norm         3.07%      53.660us        95.33%       1.668ms     556.103us      69.343us       100.00%     103.967us      34.656us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     103.487us       149.24%     103.487us     103.487us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      69.343us       100.00%      69.343us      23.114us             3  
+                                Activity Buffer Request        81.52%       1.427ms        81.52%       1.427ms       1.427ms      34.624us        49.93%      34.624us      34.624us             1  
+                                            aten::empty         1.61%      28.261us         1.61%      28.261us       3.140us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.90%     155.753us         8.90%     155.753us      51.918us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.24%       4.120us         0.24%       4.120us       0.687us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.29%       5.151us         0.29%       5.151us       5.151us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 658.593us
-Self CUDA time total: 192.505us
+Self CPU time total: 1.750ms
+Self CUDA time total: 69.343us
 
 
 
@@ -4793,19 +4781,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         5.51%      65.042us        44.22%     521.671us     521.671us       0.000us         0.00%       1.021ms       1.021ms             1  
-                                       aten::layer_norm         0.70%       8.259us        38.70%     456.629us     152.210us       0.000us         0.00%       1.021ms     340.419us             3  
-                                aten::native_layer_norm         4.08%      48.143us        38.00%     448.370us     149.457us     782.094us       100.00%       1.021ms     340.419us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     783.471us       100.18%     783.471us     783.471us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     782.094us       100.00%     782.094us     260.698us             3  
-                                Activity Buffer Request        17.26%     203.644us        17.26%     203.644us     203.644us     239.163us        30.58%     239.163us     239.163us             1  
-                                            aten::empty         2.33%      27.440us         2.33%      27.440us       3.049us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        14.01%     165.323us        14.01%     165.323us      55.108us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.32%       3.820us         0.32%       3.820us       0.637us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        55.78%     658.143us        55.78%     658.143us     658.143us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm        11.35%      67.490us        99.15%     589.690us     589.690us       0.000us         0.00%     202.330us     202.330us             1  
+                                       aten::layer_norm         1.44%       8.590us        87.80%     522.200us     174.067us       0.000us         0.00%     202.330us      67.443us             3  
+                                aten::native_layer_norm         8.41%      50.041us        86.35%     513.610us     171.203us     128.124us       100.00%     202.330us      67.443us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     129.692us       101.22%     129.692us     129.692us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     128.124us       100.00%     128.124us      42.708us             3  
+                                Activity Buffer Request        46.63%     277.315us        46.63%     277.315us     277.315us      74.206us        57.92%      74.206us      74.206us             1  
+                                            aten::empty         4.68%      27.831us         4.68%      27.831us       3.092us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        25.89%     153.973us        25.89%     153.973us      51.324us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.75%       4.450us         0.75%       4.450us       0.742us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.85%       5.080us         0.85%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.180ms
-Self CUDA time total: 782.094us
+Self CPU time total: 594.770us
+Self CUDA time total: 128.124us
 
 
 
@@ -4815,19 +4803,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S512_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.28%      65.613us        26.55%     531.932us     531.932us       0.000us         0.00%       2.062ms       2.062ms             1  
-                                       aten::layer_norm         0.44%       8.751us        23.28%     466.319us     155.440us       0.000us         0.00%       2.062ms     687.358us             3  
-                                aten::native_layer_norm         2.38%      47.740us        22.84%     457.568us     152.523us       1.599ms       100.00%       2.062ms     687.358us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.600ms       100.08%       1.600ms       1.600ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.599ms       100.00%       1.599ms     532.854us             3  
-                                Activity Buffer Request        10.57%     211.745us        10.57%     211.745us     211.745us     463.511us        29.00%     463.511us     463.511us             1  
-                                            aten::empty         1.42%      28.490us         1.42%      28.490us       3.166us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         8.28%     165.833us         8.28%     165.833us      55.278us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.19%       3.760us         0.19%       3.760us       0.627us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        73.45%       1.471ms        73.45%       1.471ms       1.471ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         6.87%      68.511us        58.17%     579.770us     579.770us       0.000us         0.00%     720.407us     720.407us             1  
+                                       aten::layer_norm         0.88%       8.821us        51.29%     511.259us     170.420us       0.000us         0.00%     720.407us     240.136us             3  
+                                aten::native_layer_norm         5.17%      51.521us        50.41%     502.438us     167.479us     546.073us       100.00%     720.407us     240.136us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     547.577us       100.28%     547.577us     547.577us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     546.073us       100.00%     546.073us     182.024us             3  
+                                Activity Buffer Request        26.52%     264.294us        26.52%     264.294us     264.294us     174.334us        31.93%     174.334us     174.334us             1  
+                                            aten::empty         2.91%      29.030us         2.91%      29.030us       3.226us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        15.39%     153.384us        15.39%     153.384us      51.128us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.42%       4.209us         0.42%       4.209us       0.702us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        41.83%     416.987us        41.83%     416.987us     416.987us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.003ms
-Self CUDA time total: 1.599ms
+Self CPU time total: 996.757us
+Self CUDA time total: 546.073us
 
 
 
@@ -4837,19 +4825,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S512_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         1.73%      65.201us        15.56%     587.973us     587.973us       0.000us         0.00%       4.337ms       4.337ms             1  
-                                       aten::layer_norm         0.23%       8.512us        13.84%     522.772us     174.257us       0.000us         0.00%       4.337ms       1.446ms             3  
-                                aten::native_layer_norm         1.23%      46.631us        13.61%     514.260us     171.420us       3.314ms       100.00%       4.337ms       1.446ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       3.315ms       100.04%       3.315ms       3.315ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       3.314ms       100.00%       3.314ms       1.105ms             3  
-                                Activity Buffer Request         7.12%     269.056us         7.12%     269.056us     269.056us       1.023ms        30.87%       1.023ms       1.023ms             1  
-                                            aten::empty         0.74%      27.840us         0.74%      27.840us       3.093us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         4.41%     166.733us         4.41%     166.733us      55.578us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.11%       4.000us         0.11%       4.000us       0.667us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        84.44%       3.190ms        84.44%       3.190ms       3.190ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         4.10%      64.241us        34.57%     541.829us     541.829us       0.000us         0.00%       1.480ms       1.480ms             1  
+                                       aten::layer_norm         0.55%       8.560us        30.47%     477.588us     159.196us       0.000us         0.00%       1.480ms     493.436us             3  
+                                aten::native_layer_norm         3.24%      50.830us        29.93%     469.028us     156.343us       1.149ms       100.00%       1.480ms     493.436us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.151ms       100.12%       1.151ms       1.151ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.149ms       100.00%       1.149ms     383.133us             3  
+                                Activity Buffer Request        14.86%     232.814us        14.86%     232.814us     232.814us     330.909us        28.79%     330.909us     330.909us             1  
+                                            aten::empty         1.86%      29.081us         1.86%      29.081us       3.231us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         9.70%     152.022us         9.70%     152.022us      50.674us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.27%       4.281us         0.27%       4.281us       0.713us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        65.43%       1.025ms        65.43%       1.025ms       1.025ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.778ms
-Self CUDA time total: 3.314ms
+Self CPU time total: 1.567ms
+Self CUDA time total: 1.149ms
 
 
 
@@ -4859,19 +4847,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S1024_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         5.47%      63.581us        43.72%     508.311us     508.311us       0.000us         0.00%       1.020ms       1.020ms             1  
-                                       aten::layer_norm         0.74%       8.570us        38.25%     444.730us     148.243us       0.000us         0.00%       1.020ms     340.056us             3  
-                                aten::native_layer_norm         4.13%      48.012us        37.51%     436.160us     145.387us     778.734us       100.00%       1.020ms     340.056us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     780.173us       100.18%     780.173us     780.173us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     778.734us       100.00%     778.734us     259.578us             3  
-                                Activity Buffer Request        16.59%     192.874us        16.59%     192.874us     192.874us     241.434us        31.00%     241.434us     241.434us             1  
-                                            aten::empty         2.34%      27.210us         2.34%      27.210us       3.023us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        14.14%     164.374us        14.14%     164.374us      54.791us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.32%       3.690us         0.32%       3.690us       0.615us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        56.28%     654.424us        56.28%     654.424us     654.424us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm        10.87%      65.290us        97.50%     585.660us     585.660us       0.000us         0.00%     211.160us     211.160us             1  
+                                       aten::layer_norm         1.49%       8.961us        86.63%     520.370us     173.457us       0.000us         0.00%     211.160us      70.387us             3  
+                                aten::native_layer_norm         8.59%      51.600us        85.14%     511.409us     170.470us     139.579us       100.00%     211.160us      70.387us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     140.987us       101.01%     140.987us     140.987us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     139.579us       100.00%     139.579us      46.526us             3  
+                                Activity Buffer Request        45.81%     275.144us        45.81%     275.144us     275.144us      71.581us        51.28%      71.581us      71.581us             1  
+                                            aten::empty         4.65%      27.942us         4.65%      27.942us       3.105us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        25.42%     152.693us        25.42%     152.693us      50.898us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.67%       4.030us         0.67%       4.030us       0.672us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         2.50%      14.990us         2.50%      14.990us      14.990us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.163ms
-Self CUDA time total: 778.734us
+Self CPU time total: 600.650us
+Self CUDA time total: 139.579us
 
 
 
@@ -4881,19 +4869,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S1024_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.23%      65.631us        26.92%     546.792us     546.792us       0.000us         0.00%       2.082ms       2.082ms             1  
-                                       aten::layer_norm         0.43%       8.791us        23.69%     481.161us     160.387us       0.000us         0.00%       2.082ms     694.073us             3  
-                                aten::native_layer_norm         2.34%      47.531us        23.26%     472.370us     157.457us       1.610ms       100.00%       2.082ms     694.073us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.611ms       100.08%       1.611ms       1.611ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.610ms       100.00%       1.610ms     536.701us             3  
-                                Activity Buffer Request        11.25%     228.475us        11.25%     228.475us     228.475us     472.116us        29.32%     472.116us     472.116us             1  
-                                            aten::empty         1.35%      27.360us         1.35%      27.360us       3.040us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         8.13%     165.123us         8.13%     165.123us      55.041us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.19%       3.881us         0.19%       3.881us       0.647us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        73.08%       1.484ms        73.08%       1.484ms       1.484ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         6.53%      63.420us        56.04%     544.209us     544.209us       0.000us         0.00%     725.021us     725.021us             1  
+                                       aten::layer_norm         0.90%       8.770us        49.51%     480.789us     160.263us       0.000us         0.00%     725.021us     241.674us             3  
+                                aten::native_layer_norm         5.25%      50.982us        48.61%     472.019us     157.340us     551.902us       100.00%     725.021us     241.674us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     553.342us       100.26%     553.342us     553.342us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     551.902us       100.00%     551.902us     183.967us             3  
+                                Activity Buffer Request        24.17%     234.744us        24.17%     234.744us     234.744us     173.119us        31.37%     173.119us     173.119us             1  
+                                            aten::empty         3.03%      29.450us         3.03%      29.450us       3.272us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        15.70%     152.482us        15.70%     152.482us      50.827us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.45%       4.361us         0.45%       4.361us       0.727us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        43.96%     426.887us        43.96%     426.887us     426.887us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.031ms
-Self CUDA time total: 1.610ms
+Self CPU time total: 971.096us
+Self CUDA time total: 551.902us
 
 
 
@@ -4903,19 +4891,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S1024_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         1.80%      65.951us        13.62%     497.891us     497.891us       0.000us         0.00%       4.269ms       4.269ms             1  
-                                       aten::layer_norm         0.24%       8.923us        11.82%     431.940us     143.980us       0.000us         0.00%       4.269ms       1.423ms             3  
-                                aten::native_layer_norm         1.26%      46.081us        11.57%     423.017us     141.006us       3.268ms       100.00%       4.269ms       1.423ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       3.270ms       100.04%       3.270ms       3.270ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       3.268ms       100.00%       3.268ms       1.089ms             3  
-                                Activity Buffer Request         4.93%     180.334us         4.93%     180.334us     180.334us       1.001ms        30.64%       1.001ms       1.001ms             1  
-                                            aten::empty         0.79%      28.830us         0.79%      28.830us       3.203us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         4.49%     164.033us         4.49%     164.033us      54.678us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.10%       3.739us         0.10%       3.739us       0.623us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        86.38%       3.157ms        86.38%       3.157ms       3.157ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         4.07%      66.881us        38.72%     635.751us     635.751us       0.000us         0.00%       1.469ms       1.469ms             1  
+                                       aten::layer_norm         0.55%       9.009us        34.64%     568.870us     189.623us       0.000us         0.00%       1.469ms     489.666us             3  
+                                aten::native_layer_norm         3.27%      53.630us        34.10%     559.861us     186.620us       1.138ms       100.00%       1.469ms     489.666us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.139ms       100.13%       1.139ms       1.139ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.138ms       100.00%       1.138ms     379.279us             3  
+                                Activity Buffer Request        19.12%     313.985us        19.12%     313.985us     313.985us     331.162us        29.10%     331.162us     331.162us             1  
+                                            aten::empty         1.88%      30.903us         1.88%      30.903us       3.434us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         9.57%     157.133us         9.57%     157.133us      52.378us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.26%       4.210us         0.26%       4.210us       0.702us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        61.28%       1.006ms        61.28%       1.006ms       1.006ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.655ms
-Self CUDA time total: 3.268ms
+Self CPU time total: 1.642ms
+Self CUDA time total: 1.138ms
 
 
 
@@ -4925,19 +4913,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S1024_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.96%      68.974us         8.04%     575.593us     575.593us       0.000us         0.00%       8.896ms       8.896ms             1  
-                                       aten::layer_norm         0.12%       8.430us         7.07%     506.619us     168.873us       0.000us         0.00%       8.896ms       2.965ms             3  
-                                aten::native_layer_norm         0.65%      46.671us         6.96%     498.189us     166.063us       6.715ms       100.00%       8.896ms       2.965ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       6.716ms       100.02%       6.716ms       6.716ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       6.715ms       100.00%       6.715ms       2.238ms             3  
-                                Activity Buffer Request         3.47%     248.836us         3.47%     248.836us     248.836us       2.181ms        32.49%       2.181ms       2.181ms             1  
-                                            aten::empty         0.38%      27.020us         0.38%      27.020us       3.002us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         2.40%     171.633us         2.40%     171.633us      57.211us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.06%       4.029us         0.06%       4.029us       0.672us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        91.96%       6.587ms        91.96%       6.587ms       6.587ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         2.42%      65.690us        15.85%     430.707us     430.707us       0.000us         0.00%       3.155ms       3.155ms             1  
+                                       aten::layer_norm         0.35%       9.490us        13.44%     365.017us     121.672us       0.000us         0.00%       3.155ms       1.052ms             3  
+                                aten::native_layer_norm         1.79%      48.727us        13.09%     355.527us     118.509us       2.409ms       100.00%       3.155ms       1.052ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.410ms       100.06%       2.410ms       2.410ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       2.409ms       100.00%       2.409ms     802.859us             3  
+                                Activity Buffer Request         4.38%     118.922us         4.38%     118.922us     118.922us     746.656us        31.00%     746.656us     746.656us             1  
+                                            aten::empty         1.13%      30.624us         1.13%      30.624us       3.403us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         5.65%     153.412us         5.65%     153.412us      51.137us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.14%       3.842us         0.14%       3.842us       0.640us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        84.15%       2.286ms        84.15%       2.286ms       2.286ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.163ms
-Self CUDA time total: 6.715ms
+Self CPU time total: 2.717ms
+Self CUDA time total: 2.409ms
 
 
 
@@ -4947,19 +4935,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.21%      65.101us        27.13%     550.572us     550.572us       0.000us         0.00%       2.070ms       2.070ms             1  
-                                       aten::layer_norm         0.42%       8.470us        23.93%     485.471us     161.824us       0.000us         0.00%       2.070ms     689.859us             3  
-                                aten::native_layer_norm         2.33%      47.334us        23.51%     477.001us     159.000us       1.603ms       100.00%       2.070ms     689.859us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.604ms       100.08%       1.604ms       1.604ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.603ms       100.00%       1.603ms     534.289us             3  
-                                Activity Buffer Request        11.26%     228.575us        11.26%     228.575us     228.575us     466.708us        29.12%     466.708us     466.708us             1  
-                                            aten::empty         1.44%      29.320us         1.44%      29.320us       3.258us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         8.29%     168.154us         8.29%     168.154us      56.051us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.18%       3.618us         0.18%       3.618us       0.603us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        72.87%       1.479ms        72.87%       1.479ms       1.479ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         6.72%      66.011us        55.62%     546.350us     546.350us       0.000us         0.00%     735.937us     735.937us             1  
+                                       aten::layer_norm         0.92%       8.990us        48.90%     480.339us     160.113us       0.000us         0.00%     735.937us     245.312us             3  
+                                aten::native_layer_norm         5.16%      50.724us        47.98%     471.349us     157.116us     560.097us       100.00%     735.937us     245.312us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     561.633us       100.27%     561.633us     561.633us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     560.097us       100.00%     560.097us     186.699us             3  
+                                Activity Buffer Request        23.82%     234.014us        23.82%     234.014us     234.014us     175.840us        31.39%     175.840us     175.840us             1  
+                                            aten::empty         2.88%      28.270us         2.88%      28.270us       3.141us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        15.72%     154.402us        15.72%     154.402us      51.467us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.40%       3.939us         0.40%       3.939us       0.656us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        44.38%     435.997us        44.38%     435.997us     435.997us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.029ms
-Self CUDA time total: 1.603ms
+Self CPU time total: 982.347us
+Self CUDA time total: 560.097us
 
 
 
@@ -4969,19 +4957,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       aten::layer_norm         0.24%       8.881us        12.85%     473.140us     157.713us       0.000us         0.00%       4.264ms       1.421ms             3  
-                                aten::native_layer_norm         1.29%      47.472us        12.61%     464.259us     154.753us       3.266ms       100.00%       4.264ms       1.421ms             3  
-                                       torch_layer_norm         1.85%      67.922us        14.70%     541.062us     541.062us       0.000us         0.00%       4.264ms       4.264ms             1  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       3.268ms       100.05%       3.268ms       3.268ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       3.266ms       100.00%       3.266ms       1.089ms             3  
-                                Activity Buffer Request         5.85%     215.475us         5.85%     215.475us     215.475us     997.400us        30.54%     997.400us     997.400us             1  
-                                            aten::empty         0.76%      27.950us         0.76%      27.950us       3.106us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         4.61%     169.513us         4.61%     169.513us      56.504us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.10%       3.849us         0.10%       3.849us       0.642us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        85.30%       3.140ms        85.30%       3.140ms       3.140ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         4.56%      64.832us        29.06%     412.897us     412.897us       0.000us         0.00%       1.469ms       1.469ms             1  
+                                       aten::layer_norm         0.65%       9.228us        24.50%     348.065us     116.022us       0.000us         0.00%       1.469ms     489.663us             3  
+                                aten::native_layer_norm         3.69%      52.410us        23.85%     338.837us     112.946us       1.133ms       100.00%       1.469ms     489.663us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.135ms       100.12%       1.135ms       1.135ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.133ms       100.00%       1.133ms     377.716us             3  
+                                Activity Buffer Request         7.07%     100.442us         7.07%     100.442us     100.442us     335.839us        29.64%     335.839us     335.839us             1  
+                                            aten::empty         2.06%      29.311us         2.06%      29.311us       3.257us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        10.76%     152.823us        10.76%     152.823us      50.941us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.27%       3.851us         0.27%       3.851us       0.642us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        70.94%       1.008ms        70.94%       1.008ms       1.008ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.681ms
-Self CUDA time total: 3.266ms
+Self CPU time total: 1.421ms
+Self CUDA time total: 1.133ms
 
 
 
@@ -4991,19 +4979,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.99%      71.201us         8.44%     605.933us     605.933us       0.000us         0.00%       8.838ms       8.838ms             1  
-                                       aten::layer_norm         0.13%       9.280us         7.45%     534.732us     178.244us       0.000us         0.00%       8.838ms       2.946ms             3  
-                                aten::native_layer_norm         0.69%      49.421us         7.32%     525.452us     175.151us       6.702ms       100.00%       8.838ms       2.946ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       6.703ms       100.02%       6.703ms       6.703ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       6.702ms       100.00%       6.702ms       2.234ms             3  
-                                Activity Buffer Request         3.78%     271.526us         3.78%     271.526us     271.526us       2.136ms        31.88%       2.136ms       2.136ms             1  
-                                            aten::empty         0.38%      27.460us         0.38%      27.460us       3.051us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         2.41%     173.045us         2.41%     173.045us      57.682us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.06%       4.000us         0.06%       4.000us       0.667us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        91.56%       6.572ms        91.56%       6.572ms       6.572ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         2.43%      67.770us        21.38%     597.070us     597.070us       0.000us         0.00%       3.032ms       3.032ms             1  
+                                       aten::layer_norm         0.34%       9.401us        18.95%     529.300us     176.433us       0.000us         0.00%       3.032ms       1.011ms             3  
+                                aten::native_layer_norm         1.84%      51.400us        18.61%     519.899us     173.300us       2.325ms       100.00%       3.032ms       1.011ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.327ms       100.06%       2.327ms       2.327ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       2.325ms       100.00%       2.325ms     775.112us             3  
+                                Activity Buffer Request         9.90%     276.585us         9.90%     276.585us     276.585us     706.558us        30.39%     706.558us     706.558us             1  
+                                            aten::empty         1.09%      30.392us         1.09%      30.392us       3.377us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         5.64%     157.652us         5.64%     157.652us      52.551us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.14%       3.870us         0.14%       3.870us       0.645us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        78.62%       2.196ms        78.62%       2.196ms       2.196ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.178ms
-Self CUDA time total: 6.702ms
+Self CPU time total: 2.793ms
+Self CUDA time total: 2.325ms
 
 
 
@@ -5013,121 +5001,75 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.51%      71.382us         4.14%     576.813us     576.813us       0.000us         0.00%      17.998ms      17.998ms             1  
-                                       aten::layer_norm         0.06%       9.001us         3.62%     505.431us     168.477us       0.000us         0.00%      17.998ms       5.999ms             3  
-                                aten::native_layer_norm         0.35%      49.501us         3.56%     496.430us     165.477us      13.500ms       100.00%      17.998ms       5.999ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      13.502ms       100.01%      13.502ms      13.502ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      13.500ms       100.00%      13.500ms       4.500ms             3  
-                                Activity Buffer Request         1.51%     210.264us         1.51%     210.264us     210.264us       4.498ms        33.31%       4.498ms       4.498ms             1  
-                                            aten::empty         0.21%      29.200us         0.21%      29.200us       3.244us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         1.46%     203.594us         1.46%     203.594us      67.865us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.03%       3.871us         0.03%       3.871us       0.645us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        95.86%      13.371ms        95.86%      13.371ms      13.371ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         1.28%      68.262us        10.71%     572.390us     572.390us       0.000us         0.00%       6.493ms       6.493ms             1  
+                                       aten::layer_norm         0.16%       8.770us         9.43%     504.128us     168.043us       0.000us         0.00%       6.493ms       2.164ms             3  
+                                aten::native_layer_norm         0.96%      51.508us         9.27%     495.358us     165.119us       4.900ms       100.00%       6.493ms       2.164ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.901ms       100.03%       4.901ms       4.901ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.900ms       100.00%       4.900ms       1.633ms             3  
+                                Activity Buffer Request         4.74%     253.634us         4.74%     253.634us     253.634us       1.594ms        32.53%       1.594ms       1.594ms             1  
+                                            aten::empty         0.56%      29.682us         0.56%      29.682us       3.298us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         2.93%     156.523us         2.93%     156.523us      52.174us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.08%       4.011us         0.08%       4.011us       0.669us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        89.29%       4.774ms        89.29%       4.774ms       4.774ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 13.947ms
-Self CUDA time total: 13.500ms
+Self CPU time total: 5.346ms
+Self CUDA time total: 4.900ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_layer_norm         LN_B16_S1024_D1024     0.29  False
-torch_layer_norm         LN_B16_S1024_D2048     0.59  False
-torch_layer_norm         LN_B16_S1024_D4096     1.15  False
-torch_layer_norm         LN_B16_S1024_D8192     2.27  False
+torch_layer_norm         LN_B16_S1024_D1024     0.05  False
+torch_layer_norm         LN_B16_S1024_D2048     0.21  False
+torch_layer_norm         LN_B16_S1024_D4096     0.42  False
+torch_layer_norm         LN_B16_S1024_D8192     0.85  False
 torch_layer_norm         LN_B16_S128_D1024      0.03  False
-torch_layer_norm         LN_B16_S128_D2048      0.04  False
-torch_layer_norm         LN_B16_S128_D4096      0.05  False
-torch_layer_norm         LN_B16_S128_D8192      0.27  False
-torch_layer_norm         LN_B16_S2048_D1024     0.59  False
-torch_layer_norm         LN_B16_S2048_D2048     1.16  False
-torch_layer_norm         LN_B16_S2048_D4096     2.30  False
-torch_layer_norm         LN_B16_S2048_D8192     4.51  False
-torch_layer_norm         LN_B16_S512_D1024      0.07  False
-torch_layer_norm         LN_B16_S512_D2048      0.29  False
-torch_layer_norm         LN_B16_S512_D4096      0.59  False
-torch_layer_norm         LN_B16_S512_D8192      1.15  False
+torch_layer_norm         LN_B16_S128_D2048      0.03  False
+torch_layer_norm         LN_B16_S128_D4096      0.04  False
+torch_layer_norm         LN_B16_S128_D8192      0.05  False
+torch_layer_norm         LN_B16_S2048_D1024     0.21  False
+torch_layer_norm         LN_B16_S2048_D2048     0.42  False
+torch_layer_norm         LN_B16_S2048_D4096     0.82  False
+torch_layer_norm         LN_B16_S2048_D8192     1.68  False
+torch_layer_norm         LN_B16_S512_D1024      0.04  False
+torch_layer_norm         LN_B16_S512_D2048      0.05  False
+torch_layer_norm         LN_B16_S512_D4096      0.21  False
+torch_layer_norm         LN_B16_S512_D8192      0.43  False
 torch_layer_norm         LN_B1_S1024_D1024      0.03  False
 torch_layer_norm         LN_B1_S1024_D2048      0.03  False
-torch_layer_norm         LN_B1_S1024_D4096      0.04  False
-torch_layer_norm         LN_B1_S1024_D8192      0.05  False
-torch_layer_norm         LN_B1_S128_D1024       0.03  False
+torch_layer_norm         LN_B1_S1024_D4096      0.03  False
+torch_layer_norm         LN_B1_S1024_D8192      0.04  False
+torch_layer_norm         LN_B1_S128_D1024       0.02  False
 torch_layer_norm         LN_B1_S128_D2048       0.03  False
 torch_layer_norm         LN_B1_S128_D4096       0.03  False
 torch_layer_norm         LN_B1_S128_D8192       0.03  False
-torch_layer_norm         LN_B1_S2048_D1024      0.04  False
-torch_layer_norm         LN_B1_S2048_D2048      0.04  False
-torch_layer_norm         LN_B1_S2048_D4096      0.05  False
-torch_layer_norm         LN_B1_S2048_D8192      0.27  False
+torch_layer_norm         LN_B1_S2048_D1024      0.03  False
+torch_layer_norm         LN_B1_S2048_D2048      0.03  False
+torch_layer_norm         LN_B1_S2048_D4096      0.04  False
+torch_layer_norm         LN_B1_S2048_D8192      0.05  False
 torch_layer_norm         LN_B1_S512_D1024       0.03  False
 torch_layer_norm         LN_B1_S512_D2048       0.03  False
 torch_layer_norm         LN_B1_S512_D4096       0.03  False
-torch_layer_norm         LN_B1_S512_D8192       0.04  False
-torch_layer_norm         LN_B4_S1024_D1024      0.05  False
-torch_layer_norm         LN_B4_S1024_D2048      0.06  False
-torch_layer_norm         LN_B4_S1024_D4096      0.28  False
-torch_layer_norm         LN_B4_S1024_D8192      0.59  False
+torch_layer_norm         LN_B1_S512_D8192       0.03  False
+torch_layer_norm         LN_B4_S1024_D1024      0.03  False
+torch_layer_norm         LN_B4_S1024_D2048      0.04  False
+torch_layer_norm         LN_B4_S1024_D4096      0.05  False
+torch_layer_norm         LN_B4_S1024_D8192      0.20  False
 torch_layer_norm         LN_B4_S128_D1024       0.03  False
 torch_layer_norm         LN_B4_S128_D2048       0.03  False
 torch_layer_norm         LN_B4_S128_D4096       0.03  False
-torch_layer_norm         LN_B4_S128_D8192       0.04  False
-torch_layer_norm         LN_B4_S2048_D1024      0.07  False
-torch_layer_norm         LN_B4_S2048_D2048      0.28  False
-torch_layer_norm         LN_B4_S2048_D4096      0.58  False
-torch_layer_norm         LN_B4_S2048_D8192      1.15  False
+torch_layer_norm         LN_B4_S128_D8192       0.03  False
+torch_layer_norm         LN_B4_S2048_D1024      0.04  False
+torch_layer_norm         LN_B4_S2048_D2048      0.05  False
+torch_layer_norm         LN_B4_S2048_D4096      0.21  False
+torch_layer_norm         LN_B4_S2048_D8192      0.44  False
 torch_layer_norm         LN_B4_S512_D1024       0.03  False
-torch_layer_norm         LN_B4_S512_D2048       0.04  False
-torch_layer_norm         LN_B4_S512_D4096       0.05  False
-torch_layer_norm         LN_B4_S512_D8192       0.27  False
+torch_layer_norm         LN_B4_S512_D2048       0.03  False
+torch_layer_norm         LN_B4_S512_D4096       0.04  False
+torch_layer_norm         LN_B4_S512_D8192       0.05  False
 </pre></div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-   Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading networkx (1.9MiB)
-Downloading setuptools (1.1MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading sympy (6.0MiB)
-Downloading pillow (6.7MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading fonttools (4.7MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading matplotlib (8.3MiB)
-Downloading numpy (15.9MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading triton (148.4MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading torch (846.8MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
-      Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading matplotlib
- Downloading nvidia-cuda-cupti-cu12
- Downloading numpy
- Downloading sympy
- Downloading nvidia-nvjitlink-cu12
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
-Installed 37 packages in 220ms
+Installed 37 packages in 246ms
 </div>
 </div>
 <div class="cell-artifacts">
diff --git a/layer_norm/results/artifacts/combine/latency.svg b/layer_norm/results/artifacts/combine/latency.svg
index 2e30821930d26924b75424f8758b9026fe74c8c4..effb56da9741bfd7e06b460f98d00a10f6c0dd0b 100644
--- a/layer_norm/results/artifacts/combine/latency.svg
+++ b/layer_norm/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9666e51a7b23e41e320cf61de04ef7044c3870632454dcae02bf6d9c87decec7
+oid sha256:e7883bd5f88a9163cc9fdaeec2076ca6319f97d413c6bea136db33612dc2b864
 size 947
diff --git a/layer_norm/results/combined_results.html b/layer_norm/results/combined_results.html
index ea4e6fadbc68b996d1f222c9696292e116acb189..8b7ee1ce45eb4a38c542c0685544f74a7f2b87bd 100644
--- a/layer_norm/results/combined_results.html
+++ b/layer_norm/results/combined_results.html
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-24T19:26:16.447564</dc:date>
+    <dc:date>2025-10-27T14:46:34.455868</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -3900,7 +3900,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: combine | 38.84s
+Cell: combine | 4.28s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -3972,13 +3972,13 @@ Cell: combine | 38.84s
 <div class="cell-stdout"><pre class="stdout-text">======================================================================
 LOADING BENCHMARK DATA
 ======================================================================
-✓ PyTorch LayerNorm             : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/layer_norm/impls/.uvnote/cache/679c54caaf848e698d978e76e5f2839b8565918d30fac991242aebea8229f1c9
-✓ HF Kernels LayerNorm          : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/layer_norm/impls/.uvnote/cache/f7de4b4d3171468ce97015124a3af1a23ef8d4ff4f319bd566a88676d47f08db
+✓ PyTorch LayerNorm             : /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/4403c31e9bef6e648597b4fcc9cfdc402678aaa4f90636b74325f12d334214a3
+✓ HF Kernels LayerNorm          : /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/bd278151199f29b397d85857b87922edaa39a62623fb28e0465de47d6a3bac74
 
   ✓ Found PyTorch LayerNorm
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/layer_norm/impls/.uvnote/cache/679c54caaf848e698d978e76e5f2839b8565918d30fac991242aebea8229f1c9/layer_norm.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/4403c31e9bef6e648597b4fcc9cfdc402678aaa4f90636b74325f12d334214a3/layer_norm.jsonl
   ✓ Found HF Kernels LayerNorm
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/layer_norm/impls/.uvnote/cache/f7de4b4d3171468ce97015124a3af1a23ef8d4ff4f319bd566a88676d47f08db/layer_norm.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/bd278151199f29b397d85857b87922edaa39a62623fb28e0465de47d6a3bac74/layer_norm.jsonl
 
 ======================================================================
 Summary: 2 found, 0 skipped, 0 missing
@@ -3987,102 +3987,102 @@ Summary: 2 found, 0 skipped, 0 missing
 COMBINED BENCHMARK SUMMARY
 
 impl                     wl                  p50(ms)  ok
-hf_kernels_layer_norm    LN_B16_S1024_D1024     0.29  False
-hf_kernels_layer_norm    LN_B16_S1024_D2048     0.61  False
-hf_kernels_layer_norm    LN_B16_S1024_D4096     1.15  False
-hf_kernels_layer_norm    LN_B16_S1024_D8192     2.27  False
+hf_kernels_layer_norm    LN_B16_S1024_D1024     0.05  False
+hf_kernels_layer_norm    LN_B16_S1024_D2048     0.22  False
+hf_kernels_layer_norm    LN_B16_S1024_D4096     0.44  False
+hf_kernels_layer_norm    LN_B16_S1024_D8192     0.84  False
 hf_kernels_layer_norm    LN_B16_S128_D1024      0.05  False
 hf_kernels_layer_norm    LN_B16_S128_D2048      0.05  False
-hf_kernels_layer_norm    LN_B16_S128_D4096      0.06  False
-hf_kernels_layer_norm    LN_B16_S128_D8192      0.30  False
-hf_kernels_layer_norm    LN_B16_S2048_D1024     0.61  False
-hf_kernels_layer_norm    LN_B16_S2048_D2048     1.20  False
-hf_kernels_layer_norm    LN_B16_S2048_D4096     2.27  False
-hf_kernels_layer_norm    LN_B16_S2048_D8192     4.51  False
-hf_kernels_layer_norm    LN_B16_S512_D1024      0.06  False
-hf_kernels_layer_norm    LN_B16_S512_D2048      0.30  False
-hf_kernels_layer_norm    LN_B16_S512_D4096      0.59  False
-hf_kernels_layer_norm    LN_B16_S512_D8192      1.16  False
+hf_kernels_layer_norm    LN_B16_S128_D4096      0.05  False
+hf_kernels_layer_norm    LN_B16_S128_D8192      0.05  False
+hf_kernels_layer_norm    LN_B16_S2048_D1024     0.21  False
+hf_kernels_layer_norm    LN_B16_S2048_D2048     0.46  False
+hf_kernels_layer_norm    LN_B16_S2048_D4096     0.84  False
+hf_kernels_layer_norm    LN_B16_S2048_D8192     1.65  False
+hf_kernels_layer_norm    LN_B16_S512_D1024      0.05  False
+hf_kernels_layer_norm    LN_B16_S512_D2048      0.05  False
+hf_kernels_layer_norm    LN_B16_S512_D4096      0.21  False
+hf_kernels_layer_norm    LN_B16_S512_D8192      0.43  False
 hf_kernels_layer_norm    LN_B1_S1024_D1024      0.05  False
 hf_kernels_layer_norm    LN_B1_S1024_D2048      0.05  False
 hf_kernels_layer_norm    LN_B1_S1024_D4096      0.05  False
-hf_kernels_layer_norm    LN_B1_S1024_D8192      0.06  False
-hf_kernels_layer_norm    LN_B1_S128_D1024       0.05  False
+hf_kernels_layer_norm    LN_B1_S1024_D8192      0.05  False
+hf_kernels_layer_norm    LN_B1_S128_D1024       0.04  False
 hf_kernels_layer_norm    LN_B1_S128_D2048       0.05  False
 hf_kernels_layer_norm    LN_B1_S128_D4096       0.05  False
 hf_kernels_layer_norm    LN_B1_S128_D8192       0.05  False
 hf_kernels_layer_norm    LN_B1_S2048_D1024      0.05  False
 hf_kernels_layer_norm    LN_B1_S2048_D2048      0.05  False
-hf_kernels_layer_norm    LN_B1_S2048_D4096      0.06  False
-hf_kernels_layer_norm    LN_B1_S2048_D8192      0.29  False
+hf_kernels_layer_norm    LN_B1_S2048_D4096      0.05  False
+hf_kernels_layer_norm    LN_B1_S2048_D8192      0.05  False
 hf_kernels_layer_norm    LN_B1_S512_D1024       0.05  False
 hf_kernels_layer_norm    LN_B1_S512_D2048       0.05  False
 hf_kernels_layer_norm    LN_B1_S512_D4096       0.05  False
 hf_kernels_layer_norm    LN_B1_S512_D8192       0.05  False
 hf_kernels_layer_norm    LN_B4_S1024_D1024      0.05  False
-hf_kernels_layer_norm    LN_B4_S1024_D2048      0.07  False
-hf_kernels_layer_norm    LN_B4_S1024_D4096      0.29  False
-hf_kernels_layer_norm    LN_B4_S1024_D8192      0.59  False
+hf_kernels_layer_norm    LN_B4_S1024_D2048      0.05  False
+hf_kernels_layer_norm    LN_B4_S1024_D4096      0.05  False
+hf_kernels_layer_norm    LN_B4_S1024_D8192      0.21  False
 hf_kernels_layer_norm    LN_B4_S128_D1024       0.05  False
 hf_kernels_layer_norm    LN_B4_S128_D2048       0.05  False
 hf_kernels_layer_norm    LN_B4_S128_D4096       0.05  False
 hf_kernels_layer_norm    LN_B4_S128_D8192       0.05  False
-hf_kernels_layer_norm    LN_B4_S2048_D1024      0.06  False
-hf_kernels_layer_norm    LN_B4_S2048_D2048      0.30  False
-hf_kernels_layer_norm    LN_B4_S2048_D4096      0.60  False
-hf_kernels_layer_norm    LN_B4_S2048_D8192      1.15  False
+hf_kernels_layer_norm    LN_B4_S2048_D1024      0.05  False
+hf_kernels_layer_norm    LN_B4_S2048_D2048      0.06  False
+hf_kernels_layer_norm    LN_B4_S2048_D4096      0.21  False
+hf_kernels_layer_norm    LN_B4_S2048_D8192      0.44  False
 hf_kernels_layer_norm    LN_B4_S512_D1024       0.05  False
 hf_kernels_layer_norm    LN_B4_S512_D2048       0.05  False
-hf_kernels_layer_norm    LN_B4_S512_D4096       0.06  False
-hf_kernels_layer_norm    LN_B4_S512_D8192       0.29  False
-torch_layer_norm         LN_B16_S1024_D1024     0.29  False
-torch_layer_norm         LN_B16_S1024_D2048     0.59  False
-torch_layer_norm         LN_B16_S1024_D4096     1.15  False
-torch_layer_norm         LN_B16_S1024_D8192     2.27  False
+hf_kernels_layer_norm    LN_B4_S512_D4096       0.05  False
+hf_kernels_layer_norm    LN_B4_S512_D8192       0.05  False
+torch_layer_norm         LN_B16_S1024_D1024     0.05  False
+torch_layer_norm         LN_B16_S1024_D2048     0.21  False
+torch_layer_norm         LN_B16_S1024_D4096     0.42  False
+torch_layer_norm         LN_B16_S1024_D8192     0.85  False
 torch_layer_norm         LN_B16_S128_D1024      0.03  False
-torch_layer_norm         LN_B16_S128_D2048      0.04  False
-torch_layer_norm         LN_B16_S128_D4096      0.05  False
-torch_layer_norm         LN_B16_S128_D8192      0.27  False
-torch_layer_norm         LN_B16_S2048_D1024     0.59  False
-torch_layer_norm         LN_B16_S2048_D2048     1.16  False
-torch_layer_norm         LN_B16_S2048_D4096     2.30  False
-torch_layer_norm         LN_B16_S2048_D8192     4.51  False
-torch_layer_norm         LN_B16_S512_D1024      0.07  False
-torch_layer_norm         LN_B16_S512_D2048      0.29  False
-torch_layer_norm         LN_B16_S512_D4096      0.59  False
-torch_layer_norm         LN_B16_S512_D8192      1.15  False
+torch_layer_norm         LN_B16_S128_D2048      0.03  False
+torch_layer_norm         LN_B16_S128_D4096      0.04  False
+torch_layer_norm         LN_B16_S128_D8192      0.05  False
+torch_layer_norm         LN_B16_S2048_D1024     0.21  False
+torch_layer_norm         LN_B16_S2048_D2048     0.42  False
+torch_layer_norm         LN_B16_S2048_D4096     0.82  False
+torch_layer_norm         LN_B16_S2048_D8192     1.68  False
+torch_layer_norm         LN_B16_S512_D1024      0.04  False
+torch_layer_norm         LN_B16_S512_D2048      0.05  False
+torch_layer_norm         LN_B16_S512_D4096      0.21  False
+torch_layer_norm         LN_B16_S512_D8192      0.43  False
 torch_layer_norm         LN_B1_S1024_D1024      0.03  False
 torch_layer_norm         LN_B1_S1024_D2048      0.03  False
-torch_layer_norm         LN_B1_S1024_D4096      0.04  False
-torch_layer_norm         LN_B1_S1024_D8192      0.05  False
-torch_layer_norm         LN_B1_S128_D1024       0.03  False
+torch_layer_norm         LN_B1_S1024_D4096      0.03  False
+torch_layer_norm         LN_B1_S1024_D8192      0.04  False
+torch_layer_norm         LN_B1_S128_D1024       0.02  False
 torch_layer_norm         LN_B1_S128_D2048       0.03  False
 torch_layer_norm         LN_B1_S128_D4096       0.03  False
 torch_layer_norm         LN_B1_S128_D8192       0.03  False
-torch_layer_norm         LN_B1_S2048_D1024      0.04  False
-torch_layer_norm         LN_B1_S2048_D2048      0.04  False
-torch_layer_norm         LN_B1_S2048_D4096      0.05  False
-torch_layer_norm         LN_B1_S2048_D8192      0.27  False
+torch_layer_norm         LN_B1_S2048_D1024      0.03  False
+torch_layer_norm         LN_B1_S2048_D2048      0.03  False
+torch_layer_norm         LN_B1_S2048_D4096      0.04  False
+torch_layer_norm         LN_B1_S2048_D8192      0.05  False
 torch_layer_norm         LN_B1_S512_D1024       0.03  False
 torch_layer_norm         LN_B1_S512_D2048       0.03  False
 torch_layer_norm         LN_B1_S512_D4096       0.03  False
-torch_layer_norm         LN_B1_S512_D8192       0.04  False
-torch_layer_norm         LN_B4_S1024_D1024      0.05  False
-torch_layer_norm         LN_B4_S1024_D2048      0.06  False
-torch_layer_norm         LN_B4_S1024_D4096      0.28  False
-torch_layer_norm         LN_B4_S1024_D8192      0.59  False
+torch_layer_norm         LN_B1_S512_D8192       0.03  False
+torch_layer_norm         LN_B4_S1024_D1024      0.03  False
+torch_layer_norm         LN_B4_S1024_D2048      0.04  False
+torch_layer_norm         LN_B4_S1024_D4096      0.05  False
+torch_layer_norm         LN_B4_S1024_D8192      0.20  False
 torch_layer_norm         LN_B4_S128_D1024       0.03  False
 torch_layer_norm         LN_B4_S128_D2048       0.03  False
 torch_layer_norm         LN_B4_S128_D4096       0.03  False
-torch_layer_norm         LN_B4_S128_D8192       0.04  False
-torch_layer_norm         LN_B4_S2048_D1024      0.07  False
-torch_layer_norm         LN_B4_S2048_D2048      0.28  False
-torch_layer_norm         LN_B4_S2048_D4096      0.58  False
-torch_layer_norm         LN_B4_S2048_D8192      1.15  False
+torch_layer_norm         LN_B4_S128_D8192       0.03  False
+torch_layer_norm         LN_B4_S2048_D1024      0.04  False
+torch_layer_norm         LN_B4_S2048_D2048      0.05  False
+torch_layer_norm         LN_B4_S2048_D4096      0.21  False
+torch_layer_norm         LN_B4_S2048_D8192      0.44  False
 torch_layer_norm         LN_B4_S512_D1024       0.03  False
-torch_layer_norm         LN_B4_S512_D2048       0.04  False
-torch_layer_norm         LN_B4_S512_D4096       0.05  False
-torch_layer_norm         LN_B4_S512_D8192       0.27  False
+torch_layer_norm         LN_B4_S512_D2048       0.03  False
+torch_layer_norm         LN_B4_S512_D4096       0.04  False
+torch_layer_norm         LN_B4_S512_D8192       0.05  False
 
 GENERATING COMBINED VISUALIZATION
 
@@ -4101,53 +4101,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-   Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading sympy (6.0MiB)
-Downloading pillow (6.7MiB)
-Downloading setuptools (1.1MiB)
-Downloading fonttools (4.7MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading networkx (1.9MiB)
-Downloading matplotlib (8.3MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading numpy (15.9MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading torch (846.8MiB)
-Downloading triton (148.4MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading networkx
- Downloading fonttools
- Downloading pillow
-      Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading matplotlib
- Downloading nvidia-cuda-cupti-cu12
- Downloading numpy
- Downloading nvidia-nvjitlink-cu12
- Downloading sympy
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
-Installed 37 packages in 205ms
+Installed 37 packages in 260ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4160,7 +4114,7 @@ Installed 37 packages in 205ms
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-24T19:26:16.447564</dc:date>
+    <dc:date>2025-10-27T14:46:34.455868</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>