diff --git a/activation/impls/artifacts/benchmark/activation.jsonl b/activation/impls/artifacts/benchmark/activation.jsonl index 434051e7334ba93dc285ce3121b82c5d0482ab20..bd99204f3d8c69a9f9ed111fe61aacdcbe3a9171 100644 --- a/activation/impls/artifacts/benchmark/activation.jsonl +++ b/activation/impls/artifacts/benchmark/activation.jsonl @@ -1,9 +1,9 @@ -{"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.04845100920647383, "p50": 0.04891102435067296, "p90": 0.05595100810751319, "mean": 0.051765027455985546, "iqr": 0.007269962225109339, "raw_times": [0.04868104588240385, 0.04891102435067296, 0.04845100920647383, 0.0568310497328639, 0.05595100810751319], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.055451004300266504, "peak_bytes": 2164736, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null} -{"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.053381023462861776, "p50": 0.05378195783123374, "p90": 0.055961019825190306, "mean": 0.05980720743536949, "iqr": 0.002529995981603861, "raw_times": [0.055961019825190306, 0.053381023462861776, 0.08248101221397519, 0.05378195783123374, 0.053431023843586445], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.061191036365926266, "peak_bytes": 2885632, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null} -{"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05066103767603636, "p50": 0.0530310207977891, "p90": 0.0544210197404027, "mean": 0.052935024723410606, "iqr": 0.002769986167550087, "raw_times": [0.05066103767603636, 0.0544210197404027, 0.0530310207977891, 0.05491101182997227, 0.05165103357285261], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057801022194325924, "peak_bytes": 5769216, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null} -{"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.051781011279672384, "p50": 0.053541967645287514, "p90": 0.05360104842111468, "mean": 0.05314521258696914, "iqr": 0.0010800431482493877, "raw_times": [0.052521005272865295, 0.05428103031590581, 0.05360104842111468, 0.053541967645287514, 0.051781011279672384], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0558110186830163, "peak_bytes": 4327424, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null} -{"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05193101242184639, "p50": 0.05308096297085285, "p90": 0.05407101707533002, "mean": 0.05422099493443966, "iqr": 0.0010799849405884743, "raw_times": [0.052991032134741545, 0.05407101707533002, 0.05903095006942749, 0.05308096297085285, 0.05193101242184639], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05453097401186824, "peak_bytes": 5769216, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null} -{"ts": "2025-10-24T19:24:48Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05170103395357728, "p50": 0.05204096669331193, "p90": 0.0529709504917264, "mean": 0.0523771857842803, "iqr": 0.0009989598765969276, "raw_times": [0.05204096669331193, 0.05170103395357728, 0.05320098716765642, 0.05197199061512947, 0.0529709504917264], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05481095286086202, "peak_bytes": 11536384, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null} -{"ts": "2025-10-24T19:24:49Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05258101737126708, "p50": 0.05330098792910576, "p90": 0.053990981541574, "mean": 0.053516996558755636, "iqr": 0.0007699709385633469, "raw_times": [0.05258101737126708, 0.053221010603010654, 0.05330098792910576, 0.054490985348820686, 0.053990981541574], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05454104393720627, "peak_bytes": 8652800, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null} -{"ts": "2025-10-24T19:24:49Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.050960981752723455, "p50": 0.05149102071300149, "p90": 0.05149102071300149, "mean": 0.051745015662163496, "iqr": 0.00012997770681977272, "raw_times": [0.050960981752723455, 0.05149102071300149, 0.05149102071300149, 0.05342101212590933, 0.05136104300618172], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05551200592890382, "peak_bytes": 11536384, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null} -{"ts": "2025-10-24T19:24:49Z", "run": "f61cf615fbaa45ae9db9a8ea612d3936", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.049641006626188755, "p50": 0.05309097468852997, "p90": 0.05348102422431111, "mean": 0.052487198263406754, "iqr": 0.0008300412446260452, "raw_times": [0.049641006626188755, 0.05357200279831886, 0.05265098297968507, 0.05309097468852997, 0.05348102422431111], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05537096876651049, "peak_bytes": 23070720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null} +{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024320999955307343, "p50": 0.025090999997701147, "p90": 0.02569000002949906, "mean": 0.026606800020090304, "iqr": 0.0010690000635804608, "raw_times": [0.03331100015202537, 0.025090999997701147, 0.024320999955307343, 0.02569000002949906, 0.0246209999659186], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03336100007800269, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028640999971685233, "p50": 0.02958999994007172, "p90": 0.030561000130546745, "mean": 0.02986059994327661, "iqr": 0.0012610003068402875, "raw_times": [0.029299999823706457, 0.028640999971685233, 0.02958999994007172, 0.0312109998503729, 0.030561000130546745], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03354099999341997, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02880000010918593, "p50": 0.030331000061778468, "p90": 0.030401000003621448, "mean": 0.030208600037440192, "iqr": 0.0004209998678561533, "raw_times": [0.02880000010918593, 0.03153099987684982, 0.029980000135765295, 0.030331000061778468, 0.030401000003621448], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03317000005154114, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02921000009337149, "p50": 0.0294310000299447, "p90": 0.029789999871354667, "mean": 0.029938399984530406, "iqr": 0.0004489997991186101, "raw_times": [0.0294310000299447, 0.02921000009337149, 0.03191999985574512, 0.029341000072236056, 0.029789999871354667], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03343000003042107, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.029799999992974335, "p50": 0.031021000040709623, "p90": 0.031239999998433632, "mean": 0.03210639997632825, "iqr": 0.0009289999525208259, "raw_times": [0.038159999803610845, 0.031021000040709623, 0.029799999992974335, 0.030311000045912806, 0.031239999998433632], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03207100007784902, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0278800000614865, "p50": 0.028550999786602915, "p90": 0.029250000125102815, "mean": 0.02903839999817137, "iqr": 0.0010100000054080738, "raw_times": [0.0278800000614865, 0.02824000011969474, 0.028550999786602915, 0.029250000125102815, 0.03127099989796989], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03262000018366962, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02945000005638576, "p50": 0.029881000045861583, "p90": 0.03017099993485317, "mean": 0.03019639998456114, "iqr": 0.0005509998572961194, "raw_times": [0.029881000045861583, 0.03185999980814813, 0.03017099993485317, 0.02945000005638576, 0.02962000007755705], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031610000178261544, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02807000009852345, "p50": 0.028989999918849207, "p90": 0.02929000015683414, "mean": 0.028920200020365883, "iqr": 0.0003590002961573191, "raw_times": [0.029320000066945795, 0.02929000015683414, 0.02807000009852345, 0.028989999918849207, 0.02893099986067682], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.033219999977518455, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.029301000040504732, "p50": 0.03090099994551565, "p90": 0.03149000008306757, "mean": 0.03127060003862425, "iqr": 0.0014889999420120148, "raw_times": [0.029301000040504732, 0.030001000141055556, 0.03149000008306757, 0.03465999998297775, 0.03090099994551565], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03197000000909611, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} diff --git a/activation/impls/cells/benchmark.py b/activation/impls/cells/benchmark.py index 711af9e01652ef5081b507affd0f7df9ac99e644..04f9df27c14acf429b58dba6cf0677c00cbbbced 100644 --- a/activation/impls/cells/benchmark.py +++ b/activation/impls/cells/benchmark.py @@ -4,6 +4,7 @@ # "numpy", # "torch==2.8.0", # "kernels-benchmark-tools", +# "kernels", # ] # # [tool.uv.sources] @@ -12,17 +13,22 @@ import torch import sys from kernels_benchmark_tools import KernelTypeEnum, run_benchmark -import torch, torch.nn.functional as F +from kernels import get_kernel +# Load the activation kernel +activation = get_kernel("kernels-community/activation") -def swiglu_eager(x): - d = x.shape[-1] // 2 - return F.silu(x[..., :d]) * x[..., d:] + +def hf_kernels_swiglu(input_tensor): + hidden_dim = input_tensor.shape[-1] // 2 + out_shape = input_tensor.shape[:-1] + (hidden_dim,) + out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device) + return activation.silu_and_mul(out, input_tensor) run_benchmark( kernel_type=KernelTypeEnum.ACTIVATION, - impl_name="torch_eager", - impl_tags={"family":"hf-kernels", "backend":"eager"}, - impl_func=swiglu_eager, + impl_name="hf_kernels_swiglu", + impl_tags={"family": "hf-kernels", "backend": "cuda"}, + impl_func=hf_kernels_swiglu, ) \ No newline at end of file diff --git a/activation/impls/hf_kernels_swiglu.html b/activation/impls/hf_kernels_swiglu.html index 5e56fd70e1083aed94a3dfe7ff9853871555ec6f..4784a25a1ce49622400c06aa26dd061266b6418d 100644 --- a/activation/impls/hf_kernels_swiglu.html +++ b/activation/impls/hf_kernels_swiglu.html @@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 4.02s +Cell: nv | 0.26s | Raw @@ -3887,34 +3887,22 @@ Cell: nv | 4.02s
-
Fri Oct 24 19:18:43 2025       
+
Mon Oct 27 14:46:00 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
-|   0  NVIDIA L4                      Off |   00000000:38:00.0 Off |                    0 |
-| N/A   35C    P0             27W /   72W |       1MiB /  23034MiB |      0%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   1  NVIDIA L4                      Off |   00000000:3A:00.0 Off |                    0 |
-| N/A   32C    P0             28W /   72W |       1MiB /  23034MiB |      2%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   2  NVIDIA L4                      Off |   00000000:3C:00.0 Off |                    0 |
-| N/A   34C    P0             27W /   72W |       1MiB /  23034MiB |      1%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   3  NVIDIA L4                      Off |   00000000:3E:00.0 Off |                    0 |
-| N/A   32C    P0             27W /   72W |       1MiB /  23034MiB |      2%      Default |
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   32C    P0            153W /  350W |       0MiB /  46068MiB |     75%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
 +-----------------------------------------------------------------------------------------+
 | Processes:                                                                              |
-|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
 |        ID   ID                                                               Usage      |
 |=========================================================================================|
 |  No running processes found                                                             |
@@ -3932,7 +3920,7 @@ Cell: nv | 4.02s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 43.68s
+Cell: benchmark | 4.32s
  | 
 
 Raw
@@ -3988,17 +3976,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      72.447us      1172.85%      72.447us      72.447us             1  
-                                      hf_kernels_swiglu        10.70%     189.904us        99.62%       1.769ms       1.769ms       0.000us         0.00%       8.289us       8.289us             1  
-                      _activation_beeaae6::silu_and_mul         1.07%      18.931us        86.38%       1.534ms     511.168us       6.177us       100.00%       8.289us       2.763us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       6.177us       100.00%       6.177us       2.059us             3  
-                                Activity Buffer Request        82.95%       1.473ms        82.95%       1.473ms       1.473ms       2.112us        34.19%       2.112us       2.112us             1  
-                                            aten::empty         2.54%      45.151us         2.54%      45.151us      15.050us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.36%      41.961us         2.36%      41.961us      13.987us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.38%       6.701us         0.38%       6.701us       6.701us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      80.128us      1940.62%      80.128us      80.128us             1  
+                                      hf_kernels_swiglu        11.19%     199.383us        99.56%       1.774ms       1.774ms       0.000us         0.00%       5.634us       5.634us             1  
+                      _activation_beeaae6::silu_and_mul         1.10%      19.601us        85.64%       1.526ms     508.618us       4.129us       100.00%       5.634us       1.878us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.129us       100.00%       4.129us       1.376us             3  
+                                Activity Buffer Request        82.30%       1.466ms        82.30%       1.466ms       1.466ms       1.505us        36.45%       1.505us       1.505us             1  
+                                            aten::empty         2.73%      48.641us         2.73%      48.641us      16.214us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.24%      39.931us         2.24%      39.931us      13.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.44%       7.891us         0.44%       7.891us       7.891us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.775ms
-Self CUDA time total: 6.177us
+Self CPU time total: 1.782ms
+Self CUDA time total: 4.129us
 
 
 
@@ -4008,17 +3996,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      91.934us      1135.55%      91.934us      91.934us             1  
-                                      hf_kernels_swiglu         6.80%     114.004us        99.69%       1.672ms       1.672ms       0.000us         0.00%      10.816us      10.816us             1  
-                      _activation_beeaae6::silu_and_mul         1.26%      21.089us        91.64%       1.537ms     512.271us       8.096us       100.00%      10.816us       3.605us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       8.096us       100.00%       8.096us       2.699us             3  
-                                Activity Buffer Request        88.69%       1.487ms        88.69%       1.487ms       1.487ms       2.720us        33.60%       2.720us       2.720us             1  
-                                            aten::empty         1.24%      20.870us         1.24%      20.870us       6.957us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.70%      28.501us         1.70%      28.501us       9.500us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.31%       5.260us         0.31%       5.260us       5.260us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      77.823us      1961.76%      77.823us      77.823us             1  
+                                      hf_kernels_swiglu         7.28%     119.722us        99.70%       1.640ms       1.640ms       0.000us         0.00%       5.311us       5.311us             1  
+                      _activation_beeaae6::silu_and_mul         1.57%      25.841us        91.18%       1.500ms     499.858us       3.967us       100.00%       5.311us       1.770us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       3.967us       100.00%       3.967us       1.322us             3  
+                                Activity Buffer Request        87.74%       1.443ms        87.74%       1.443ms       1.443ms       1.344us        33.88%       1.344us       1.344us             1  
+                                            aten::empty         1.24%      20.410us         1.24%      20.410us       6.803us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.86%      30.650us         1.86%      30.650us      10.217us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       4.930us         0.30%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.677ms
-Self CUDA time total: 8.096us
+Self CPU time total: 1.645ms
+Self CUDA time total: 3.967us
 
 
 
@@ -4028,17 +4016,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.039us       596.86%      67.039us      67.039us             1  
-                                      hf_kernels_swiglu         5.22%      85.373us        99.71%       1.630ms       1.630ms       0.000us         0.00%      15.008us      15.008us             1  
-                      _activation_beeaae6::silu_and_mul         1.19%      19.431us        93.38%       1.527ms     508.877us      11.232us       100.00%      15.008us       5.003us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      11.232us       100.00%      11.232us       3.744us             3  
-                                Activity Buffer Request        90.58%       1.481ms        90.58%       1.481ms       1.481ms       3.776us        33.62%       3.776us       3.776us             1  
-                                            aten::empty         1.11%      18.160us         1.11%      18.160us       6.053us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.61%      26.370us         1.61%      26.370us       8.790us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.29%       4.730us         0.29%       4.730us       4.730us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.487us      1369.46%      67.487us      67.487us             1  
+                                      hf_kernels_swiglu         6.70%     107.400us        99.69%       1.598ms       1.598ms       0.000us         0.00%       6.592us       6.592us             1  
+                      _activation_beeaae6::silu_and_mul         1.32%      21.191us        91.79%       1.471ms     490.438us       4.928us       100.00%       6.592us       2.197us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.928us       100.00%       4.928us       1.643us             3  
+                                Activity Buffer Request        88.89%       1.425ms        88.89%       1.425ms       1.425ms       1.664us        33.77%       1.664us       1.664us             1  
+                                            aten::empty         1.20%      19.281us         1.20%      19.281us       6.427us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.57%      25.210us         1.57%      25.210us       8.403us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.31%       4.970us         0.31%       4.970us       4.970us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.635ms
-Self CUDA time total: 11.232us
+Self CPU time total: 1.603ms
+Self CUDA time total: 4.928us
 
 
 
@@ -4048,17 +4036,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      69.598us       870.08%      69.598us      69.598us             1  
-                                      hf_kernels_swiglu         4.94%      87.632us        99.74%       1.771ms       1.771ms       0.000us         0.00%      10.719us      10.719us             1  
-                      _activation_beeaae6::silu_and_mul         1.09%      19.352us        93.69%       1.663ms     554.452us       7.999us       100.00%      10.719us       3.573us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       7.999us       100.00%       7.999us       2.666us             3  
-                                Activity Buffer Request        83.17%       1.477ms        83.17%       1.477ms       1.477ms       2.720us        34.00%       2.720us       2.720us             1  
-                                            aten::empty         1.11%      19.710us         1.11%      19.710us       6.570us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.43%     167.443us         9.43%     167.443us      55.814us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.26%       4.610us         0.26%       4.610us       4.610us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      75.265us      1768.03%      75.265us      75.265us             1  
+                                      hf_kernels_swiglu         6.51%     118.032us        99.70%       1.807ms       1.807ms       0.000us         0.00%       5.697us       5.697us             1  
+                      _activation_beeaae6::silu_and_mul         1.22%      22.071us        92.05%       1.668ms     556.119us       4.257us       100.00%       5.697us       1.899us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.257us       100.00%       4.257us       1.419us             3  
+                                Activity Buffer Request        79.39%       1.439ms        79.39%       1.439ms       1.439ms       1.440us        33.83%       1.440us       1.440us             1  
+                                            aten::empty         1.14%      20.640us         1.14%      20.640us       6.880us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        11.45%     207.513us        11.45%     207.513us      69.171us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       5.350us         0.30%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.775ms
-Self CUDA time total: 7.999us
+Self CPU time total: 1.812ms
+Self CUDA time total: 4.257us
 
 
 
@@ -4068,17 +4056,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      70.239us       570.12%      70.239us      70.239us             1  
-                                      hf_kernels_swiglu         5.14%      91.331us        99.75%       1.772ms       1.772ms       0.000us         0.00%      16.448us      16.448us             1  
-                      _activation_beeaae6::silu_and_mul         1.09%      19.360us        93.54%       1.662ms     553.872us      12.320us       100.00%      16.448us       5.483us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      12.320us       100.00%      12.320us       4.107us             3  
-                                Activity Buffer Request        83.14%       1.477ms        83.14%       1.477ms       1.477ms       4.128us        33.51%       4.128us       4.128us             1  
-                                            aten::empty         1.07%      19.032us         1.07%      19.032us       6.344us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.31%     165.333us         9.31%     165.333us      55.111us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.25%       4.400us         0.25%       4.400us       4.400us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      65.471us      1111.94%      65.471us      65.471us             1  
+                                      hf_kernels_swiglu        19.52%      89.390us        98.84%     452.537us     452.537us       0.000us         0.00%       7.872us       7.872us             1  
+                      _activation_beeaae6::silu_and_mul         5.02%      23.003us        75.04%     343.547us     114.516us       5.888us       100.00%       7.872us       2.624us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us       100.00%       5.888us       1.963us             3  
+                                Activity Buffer Request        33.89%     155.152us        33.89%     155.152us     155.152us       1.984us        33.70%       1.984us       1.984us             1  
+                                            aten::empty         4.28%      19.600us         4.28%      19.600us       6.533us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        36.13%     165.392us        36.13%     165.392us      55.131us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.16%       5.290us         1.16%       5.290us       5.290us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.776ms
-Self CUDA time total: 12.320us
+Self CPU time total: 457.827us
+Self CUDA time total: 5.888us
 
 
 
@@ -4088,17 +4076,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      68.766us       394.32%      68.766us      68.766us             1  
-                                      hf_kernels_swiglu        16.12%      86.942us        99.12%     534.642us     534.642us       0.000us         0.00%      23.263us      23.263us             1  
-                      _activation_beeaae6::silu_and_mul         3.56%      19.181us        79.14%     426.890us     142.297us      17.439us       100.00%      23.263us       7.754us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      17.439us       100.00%      17.439us       5.813us             3  
-                                Activity Buffer Request        44.72%     241.246us        44.72%     241.246us     241.246us       5.824us        33.40%       5.824us       5.824us             1  
-                                            aten::empty         3.86%      20.810us         3.86%      20.810us       6.937us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.86%     166.463us        30.86%     166.463us      55.488us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.88%       4.760us         0.88%       4.760us       4.760us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      68.383us       879.52%      68.383us      68.383us             1  
+                                      hf_kernels_swiglu         6.83%     118.711us        99.72%       1.734ms       1.734ms       0.000us         0.00%      10.367us      10.367us             1  
+                      _activation_beeaae6::silu_and_mul         1.25%      21.741us        91.78%       1.596ms     531.855us       7.775us       100.00%      10.367us       3.456us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       7.775us       100.00%       7.775us       2.592us             3  
+                                Activity Buffer Request        81.74%       1.421ms        81.74%       1.421ms       1.421ms       2.592us        33.34%       2.592us       2.592us             1  
+                                            aten::empty         1.11%      19.311us         1.11%      19.311us       6.437us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.79%     152.752us         8.79%     152.752us      50.917us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       4.930us         0.28%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 539.402us
-Self CUDA time total: 17.439us
+Self CPU time total: 1.739ms
+Self CUDA time total: 7.775us
 
 
 
@@ -4108,17 +4096,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.422us       541.63%      67.422us      67.422us             1  
-                                      hf_kernels_swiglu        15.67%      86.170us        99.13%     545.172us     545.172us       0.000us         0.00%      16.576us      16.576us             1  
-                      _activation_beeaae6::silu_and_mul         3.45%      18.981us        79.89%     439.370us     146.457us      12.448us       100.00%      16.576us       5.525us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      12.448us       100.00%      12.448us       4.149us             3  
-                                Activity Buffer Request        46.28%     254.506us        46.28%     254.506us     254.506us       4.128us        33.16%       4.128us       4.128us             1  
-                                            aten::empty         3.57%      19.632us         3.57%      19.632us       6.544us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.16%     165.883us        30.16%     165.883us      55.294us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.87%       4.770us         0.87%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      70.527us      1069.89%      70.527us      70.527us             1  
+                                      hf_kernels_swiglu         6.20%     108.691us        99.73%       1.749ms       1.749ms       0.000us         0.00%       8.800us       8.800us             1  
+                      _activation_beeaae6::silu_and_mul         1.29%      22.622us        92.35%       1.619ms     539.785us       6.592us       100.00%       8.800us       2.933us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       6.592us       100.00%       6.592us       2.197us             3  
+                                Activity Buffer Request        82.48%       1.446ms        82.48%       1.446ms       1.446ms       2.208us        33.50%       2.208us       2.208us             1  
+                                            aten::empty         1.18%      20.650us         1.18%      20.650us       6.883us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.58%     150.492us         8.58%     150.492us      50.164us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.780us         0.27%       4.780us       4.780us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 549.942us
-Self CUDA time total: 12.448us
+Self CPU time total: 1.753ms
+Self CUDA time total: 6.592us
 
 
 
@@ -4128,17 +4116,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      70.941us       341.59%      70.941us      70.941us             1  
-                                      hf_kernels_swiglu        15.89%      87.442us        99.17%     545.692us     545.692us       0.000us         0.00%      27.744us      27.744us             1  
-                      _activation_beeaae6::silu_and_mul         3.49%      19.210us        79.79%     439.080us     146.360us      20.768us       100.00%      27.744us       9.248us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      20.768us       100.00%      20.768us       6.923us             3  
-                                Activity Buffer Request        46.16%     253.986us        46.16%     253.986us     253.986us       6.976us        33.59%       6.976us       6.976us             1  
-                                            aten::empty         3.48%      19.170us         3.48%      19.170us       6.390us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.15%     165.884us        30.15%     165.884us      55.295us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.83%       4.591us         0.83%       4.591us       4.591us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      66.591us       703.03%      66.591us      66.591us             1  
+                                      hf_kernels_swiglu        22.91%      88.512us        98.75%     381.506us     381.506us       0.000us         0.00%      12.640us      12.640us             1  
+                      _activation_beeaae6::silu_and_mul         5.22%      20.151us        70.42%     272.064us      90.688us       9.472us       100.00%      12.640us       4.213us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       9.472us       100.00%       9.472us       3.157us             3  
+                                Activity Buffer Request        26.21%     101.241us        26.21%     101.241us     101.241us       3.168us        33.45%       3.168us       3.168us             1  
+                                            aten::empty         5.42%      20.930us         5.42%      20.930us       6.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        39.00%     150.672us        39.00%     150.672us      50.224us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.25%       4.820us         1.25%       4.820us       4.820us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 550.283us
-Self CUDA time total: 20.768us
+Self CPU time total: 386.326us
+Self CUDA time total: 9.472us
 
 
 
@@ -4148,17 +4136,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      70.780us       228.74%      70.780us      70.780us             1  
-                                      hf_kernels_swiglu        16.83%      85.362us        99.15%     502.911us     502.911us       0.000us         0.00%      41.183us      41.183us             1  
-                      _activation_beeaae6::silu_and_mul         3.74%      18.980us        78.74%     399.388us     133.129us      30.943us       100.00%      41.183us      13.728us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      30.943us       100.00%      30.943us      10.314us             3  
-                                Activity Buffer Request        42.65%     216.335us        42.65%     216.335us     216.335us      10.240us        33.09%      10.240us      10.240us             1  
-                                            aten::empty         3.58%      18.161us         3.58%      18.161us       6.054us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.35%     164.073us        32.35%     164.073us      54.691us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.85%       4.320us         0.85%       4.320us       4.320us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.295us       514.21%      67.295us      67.295us             1  
+                                      hf_kernels_swiglu        24.05%     101.492us        98.90%     417.266us     417.266us       0.000us         0.00%      17.503us      17.503us             1  
+                      _activation_beeaae6::silu_and_mul         5.33%      22.480us        70.08%     295.684us      98.561us      13.087us       100.00%      17.503us       5.834us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      13.087us       100.00%      13.087us       4.362us             3  
+                                Activity Buffer Request        28.92%     122.012us        28.92%     122.012us     122.012us       4.416us        33.74%       4.416us       4.416us             1  
+                                            aten::empty         4.76%      20.090us         4.76%      20.090us       6.697us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        35.83%     151.192us        35.83%     151.192us      50.397us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.10%       4.660us         1.10%       4.660us       4.660us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 507.231us
-Self CUDA time total: 30.943us
+Self CPU time total: 421.926us
+Self CUDA time total: 13.087us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4175,61 +4163,12 @@ hf_kernels_swiglu        cuda_T512_D768         0.03  True
 
▶ UV Install Logs
Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s] -Fetching 7 files: 14%|█▍ | 1/7 [00:00<00:01, 4.08it/s] -Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 9.40it/s] -Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 12.20it/s]
+Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 13.68it/s] +Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 19.14it/s]

Artifacts:

activation.jsonl diff --git a/activation/impls/torch_swiglu.html b/activation/impls/torch_swiglu.html index fad608debc07afac9b5948fbe5ed773bd10d6a06..6347cf8477b3c77c2a153235fedda937b464164d 100644 --- a/activation/impls/torch_swiglu.html +++ b/activation/impls/torch_swiglu.html @@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 4.02s +Cell: nv | 0.26s | Raw @@ -3887,34 +3887,22 @@ Cell: nv | 4.02s
-
Fri Oct 24 19:24:09 2025       
+
Mon Oct 27 14:46:00 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
-|   0  NVIDIA L4                      Off |   00000000:38:00.0 Off |                    0 |
-| N/A   36C    P0             27W /   72W |       1MiB /  23034MiB |      2%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   1  NVIDIA L4                      Off |   00000000:3A:00.0 Off |                    0 |
-| N/A   33C    P0             28W /   72W |       1MiB /  23034MiB |      0%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   2  NVIDIA L4                      Off |   00000000:3C:00.0 Off |                    0 |
-| N/A   34C    P0             27W /   72W |       1MiB /  23034MiB |      0%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   3  NVIDIA L4                      Off |   00000000:3E:00.0 Off |                    0 |
-| N/A   33C    P0             27W /   72W |       1MiB /  23034MiB |      2%      Default |
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   32C    P0            153W /  350W |       0MiB /  46068MiB |     75%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
 +-----------------------------------------------------------------------------------------+
 | Processes:                                                                              |
-|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
 |        ID   ID                                                               Usage      |
 |=========================================================================================|
 |  No running processes found                                                             |
@@ -3932,7 +3920,7 @@ Cell: nv | 4.02s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 42.42s
+Cell: benchmark | 6.99s
  | 
 
 Raw
@@ -3982,20 +3970,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     237.726us      1604.74%     237.726us     237.726us             1  
-                                            torch_eager        11.30%     225.975us        99.63%       1.992ms       1.992ms       0.000us         0.00%      17.566us      17.566us             1  
-                                             aten::silu         3.42%      68.411us        81.12%       1.622ms     540.728us       7.646us        51.61%      10.398us       3.466us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.646us        51.61%       7.646us       2.549us             3  
-                                              aten::mul         2.15%      42.970us         3.33%      66.621us      22.207us       7.168us        48.39%       7.168us       2.389us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.168us        48.39%       7.168us       2.389us             3  
-                                Activity Buffer Request        74.74%       1.495ms        74.74%       1.495ms       1.495ms       2.752us        18.58%       2.752us       2.752us             1  
-                                            aten::slice         3.26%      65.261us         3.88%      77.582us      12.930us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.62%      12.321us         0.62%      12.321us       2.053us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         4.14%      82.803us         4.14%      82.803us      13.800us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.37%       7.380us         0.37%       7.380us       7.380us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     183.359us      1436.08%     183.359us     183.359us             1  
+                                            torch_eager        11.24%     212.694us        99.53%       1.883ms       1.883ms       0.000us         0.00%      15.072us      15.072us             1  
+                                             aten::silu         3.31%      62.660us        82.30%       1.557ms     519.134us       6.527us        51.12%       8.831us       2.944us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.527us        51.12%       6.527us       2.176us             3  
+                                              aten::mul         1.85%      35.100us         2.98%      56.340us      18.780us       6.241us        48.88%       6.241us       2.080us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.241us        48.88%       6.241us       2.080us             3  
+                                Activity Buffer Request        76.74%       1.452ms        76.74%       1.452ms       1.452ms       2.304us        18.05%       2.304us       2.304us             1  
+                                            aten::slice         2.41%      45.561us         3.01%      56.902us       9.484us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.60%      11.341us         0.60%      11.341us       1.890us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.37%      63.741us         3.37%      63.741us      10.623us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.47%       8.969us         0.47%       8.969us       8.969us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.000ms
-Self CUDA time total: 14.814us
+Self CPU time total: 1.892ms
+Self CUDA time total: 12.768us
 
 
 
@@ -4005,20 +3993,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     155.197us      1056.55%     155.197us     155.197us             1  
-                                            torch_eager         6.38%     113.914us        99.69%       1.779ms       1.779ms       0.000us         0.00%      17.249us      17.249us             1  
-                                             aten::silu         2.13%      37.960us        88.89%       1.587ms     528.841us       7.616us        51.85%      10.176us       3.392us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.616us        51.85%       7.616us       2.539us             3  
-                                              aten::mul         1.58%      28.130us         2.63%      46.991us      15.664us       7.073us        48.15%       7.073us       2.358us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.073us        48.15%       7.073us       2.358us             3  
-                                Activity Buffer Request        85.27%       1.522ms        85.27%       1.522ms       1.522ms       2.560us        17.43%       2.560us       2.560us             1  
-                                            aten::slice         1.43%      25.481us         1.78%      31.850us       5.308us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.36%       6.369us         0.36%       6.369us       1.061us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         2.55%      45.552us         2.55%      45.552us       7.592us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.31%       5.590us         0.31%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     158.431us      1279.63%     158.431us     158.431us             1  
+                                            torch_eager         6.85%     117.301us        99.69%       1.707ms       1.707ms       0.000us         0.00%      14.557us      14.557us             1  
+                                             aten::silu         2.45%      41.990us        88.25%       1.511ms     503.680us       6.398us        51.68%       8.574us       2.858us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.398us        51.68%       6.398us       2.133us             3  
+                                              aten::mul         1.63%      27.830us         2.78%      47.630us      15.877us       5.983us        48.32%       5.983us       1.994us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.983us        48.32%       5.983us       1.994us             3  
+                                Activity Buffer Request        84.28%       1.443ms        84.28%       1.443ms       1.443ms       2.176us        17.58%       2.176us       2.176us             1  
+                                            aten::slice         1.45%      24.820us         1.81%      30.931us       5.155us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.36%       6.111us         0.36%       6.111us       1.019us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.67%      45.711us         2.67%      45.711us       7.618us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.31%       5.320us         0.31%       5.320us       5.320us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.785ms
-Self CUDA time total: 14.689us
+Self CPU time total: 1.712ms
+Self CUDA time total: 12.381us
 
 
 
@@ -4028,20 +4016,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     157.724us       928.23%     157.724us     157.724us             1  
-                                            torch_eager         6.06%     107.501us        99.72%       1.769ms       1.769ms       0.000us         0.00%      19.872us      19.872us             1  
-                                             aten::silu         2.60%      46.162us        89.17%       1.581ms     527.145us       8.576us        50.47%      11.456us       3.819us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       8.576us        50.47%       8.576us       2.859us             3  
-                                              aten::mul         1.54%      27.281us         2.61%      46.211us      15.404us       8.416us        49.53%       8.416us       2.805us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       8.416us        49.53%       8.416us       2.805us             3  
-                                Activity Buffer Request        85.05%       1.508ms        85.05%       1.508ms       1.508ms       2.880us        16.95%       2.880us       2.880us             1  
-                                            aten::slice         1.51%      26.721us         1.88%      33.391us       5.565us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.38%       6.670us         0.38%       6.670us       1.112us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         2.58%      45.781us         2.58%      45.781us       7.630us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.28%       4.940us         0.28%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     145.182us      1095.88%     145.182us     145.182us             1  
+                                            torch_eager         6.28%     105.841us        99.65%       1.680ms       1.680ms       0.000us         0.00%      15.552us      15.552us             1  
+                                             aten::silu         2.40%      40.400us        89.03%       1.501ms     500.258us       6.816us        51.45%       9.120us       3.040us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.816us        51.45%       6.816us       2.272us             3  
+                                              aten::mul         1.52%      25.690us         2.64%      44.480us      14.827us       6.432us        48.55%       6.432us       2.144us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.432us        48.55%       6.432us       2.144us             3  
+                                Activity Buffer Request        85.10%       1.434ms        85.10%       1.434ms       1.434ms       2.304us        17.39%       2.304us       2.304us             1  
+                                            aten::slice         1.37%      23.030us         1.70%      28.690us       4.782us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.34%       5.660us         0.34%       5.660us       0.943us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.66%      44.762us         2.66%      44.762us       7.460us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.35%       5.820us         0.35%       5.820us       5.820us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.773ms
-Self CUDA time total: 16.992us
+Self CPU time total: 1.686ms
+Self CUDA time total: 13.248us
 
 
 
@@ -4051,20 +4039,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     154.972us       984.26%     154.972us     154.972us             1  
-                                            torch_eager         7.81%     106.363us        99.66%       1.357ms       1.357ms       0.000us         0.00%      18.497us      18.497us             1  
-                                             aten::silu         3.01%      41.020us        86.15%       1.173ms     391.021us       8.096us        51.42%      10.848us       3.616us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       8.096us        51.42%       8.096us       2.699us             3  
-                                              aten::mul         1.89%      25.761us         3.27%      44.581us      14.860us       7.649us        48.58%       7.649us       2.550us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.649us        48.58%       7.649us       2.550us             3  
-                                Activity Buffer Request        68.76%     936.210us        68.76%     936.210us     936.210us       2.752us        17.48%       2.752us       2.752us             1  
-                                            aten::slice         1.90%      25.829us         2.43%      33.031us       5.505us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.53%       7.202us         0.53%       7.202us       1.200us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        15.76%     214.654us        15.76%     214.654us      35.776us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.34%       4.590us         0.34%       4.590us       4.590us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     145.025us      1135.85%     145.025us     145.025us             1  
+                                            torch_eager         7.55%     116.292us        99.65%       1.535ms       1.535ms       0.000us         0.00%      14.976us      14.976us             1  
+                                             aten::silu         2.67%      41.061us        87.34%       1.345ms     448.460us       6.592us        51.63%       8.800us       2.933us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.592us        51.63%       6.592us       2.197us             3  
+                                              aten::mul         1.71%      26.359us         2.88%      44.330us      14.777us       6.176us        48.37%       6.176us       2.059us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        48.37%       6.176us       2.059us             3  
+                                Activity Buffer Request        69.61%       1.072ms        69.61%       1.072ms       1.072ms       2.208us        17.29%       2.208us       2.208us             1  
+                                            aten::slice         1.52%      23.350us         1.89%      29.050us       4.842us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.37%       5.700us         0.37%       5.700us       0.950us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        16.23%     250.045us        16.23%     250.045us      41.674us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.35%       5.360us         0.35%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.362ms
-Self CUDA time total: 15.745us
+Self CPU time total: 1.540ms
+Self CUDA time total: 12.768us
 
 
 
@@ -4074,20 +4062,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     155.837us       910.37%     155.837us     155.837us             1  
-                                            torch_eager         5.68%     106.351us        99.75%       1.869ms       1.869ms       0.000us         0.00%      20.126us      20.126us             1  
-                                             aten::silu         2.11%      39.481us        89.91%       1.685ms     561.559us       8.671us        50.65%      11.679us       3.893us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       8.671us        50.65%       8.671us       2.890us             3  
-                                              aten::mul         1.44%      26.891us         2.49%      46.661us      15.554us       8.447us        49.35%       8.447us       2.816us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       8.447us        49.35%       8.447us       2.816us             3  
-                                Activity Buffer Request        78.61%       1.473ms        78.61%       1.473ms       1.473ms       3.008us        17.57%       3.008us       3.008us             1  
-                                            aten::slice         1.33%      24.861us         1.68%      31.451us       5.242us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.35%       6.590us         0.35%       6.590us       1.098us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.25%     192.054us        10.25%     192.054us      32.009us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       4.670us         0.25%       4.670us       4.670us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     144.030us      1089.82%     144.030us     144.030us             1  
+                                            torch_eager         5.82%     104.551us        99.68%       1.792ms       1.792ms       0.000us         0.00%      15.488us      15.488us             1  
+                                             aten::silu         2.32%      41.682us        89.81%       1.614ms     538.151us       6.752us        51.09%       9.024us       3.008us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.752us        51.09%       6.752us       2.251us             3  
+                                              aten::mul         1.41%      25.409us         2.48%      44.550us      14.850us       6.464us        48.91%       6.464us       2.155us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.464us        48.91%       6.464us       2.155us             3  
+                                Activity Buffer Request        78.50%       1.411ms        78.50%       1.411ms       1.411ms       2.272us        17.19%       2.272us       2.272us             1  
+                                            aten::slice         1.27%      22.830us         1.58%      28.320us       4.720us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.31%       5.490us         0.31%       5.490us       0.915us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.06%     180.853us        10.06%     180.853us      30.142us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.32%       5.710us         0.32%       5.710us       5.710us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.874ms
-Self CUDA time total: 17.118us
+Self CPU time total: 1.798ms
+Self CUDA time total: 13.216us
 
 
 
@@ -4097,20 +4085,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     155.486us       596.92%     155.486us     155.486us             1  
-                                            torch_eager        20.98%     108.302us        99.11%     511.621us     511.621us       0.000us         0.00%      30.592us      30.592us             1  
-                                             aten::silu         7.61%      39.290us        63.32%     326.866us     108.955us      13.504us        51.84%      18.048us       6.016us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      13.504us        51.84%      13.504us       4.501us             3  
-                                              aten::mul         5.03%      25.960us         8.46%      43.671us      14.557us      12.544us        48.16%      12.544us       4.181us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      12.544us        48.16%      12.544us       4.181us             3  
-                                Activity Buffer Request        25.15%     129.813us        25.15%     129.813us     129.813us       4.544us        17.44%       4.544us       4.544us             1  
-                                            aten::slice         5.13%      26.471us         6.35%      32.782us       5.464us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         1.22%       6.311us         1.22%       6.311us       1.052us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        33.99%     175.474us        33.99%     175.474us      29.246us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.89%       4.611us         0.89%       4.611us       4.611us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     140.382us       902.66%     140.382us     140.382us             1  
+                                            torch_eager        21.39%     103.633us        98.99%     479.697us     479.697us       0.000us         0.00%      18.240us      18.240us             1  
+                                             aten::silu         8.56%      41.460us        63.18%     306.154us     102.051us       7.936us        51.03%      10.624us       3.541us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        51.03%       7.936us       2.645us             3  
+                                              aten::mul         4.90%      23.759us         8.63%      41.840us      13.947us       7.616us        48.97%       7.616us       2.539us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.616us        48.97%       7.616us       2.539us             3  
+                                Activity Buffer Request        23.12%     112.032us        23.12%     112.032us     112.032us       2.688us        17.28%       2.688us       2.688us             1  
+                                            aten::slice         4.68%      22.671us         5.79%      28.070us       4.678us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         1.11%       5.399us         1.11%       5.399us       0.900us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        35.23%     170.743us        35.23%     170.743us      28.457us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.01%       4.900us         1.01%       4.900us       4.900us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 516.232us
-Self CUDA time total: 26.048us
+Self CPU time total: 484.597us
+Self CUDA time total: 15.552us
 
 
 
@@ -4120,20 +4108,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     163.293us       685.87%     163.293us     163.293us             1  
-                                            torch_eager         5.58%     106.954us        99.75%       1.910ms       1.910ms       0.000us         0.00%      27.872us      27.872us             1  
-                                             aten::silu         2.13%      40.799us        89.92%       1.722ms     574.075us      12.032us        50.54%      16.096us       5.365us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      12.032us        50.54%      12.032us       4.011us             3  
-                                              aten::mul         1.39%      26.590us         2.40%      46.050us      15.350us      11.776us        49.46%      11.776us       3.925us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.776us        49.46%      11.776us       3.925us             3  
-                                Activity Buffer Request        79.43%       1.521ms        79.43%       1.521ms       1.521ms       4.064us        17.07%       4.064us       4.064us             1  
-                                            aten::slice         1.44%      27.592us         1.83%      35.091us       5.849us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.39%       7.499us         0.39%       7.499us       1.250us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.38%     179.564us         9.38%     179.564us      29.927us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       4.880us         0.25%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     145.662us      1011.54%     145.662us     145.662us             1  
+                                            torch_eager         5.99%     108.381us        99.73%       1.804ms       1.804ms       0.000us         0.00%      16.896us      16.896us             1  
+                                             aten::silu         2.28%      41.342us        89.69%       1.623ms     540.945us       7.392us        51.33%       9.888us       3.296us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        51.33%       7.392us       2.464us             3  
+                                              aten::mul         1.44%      26.049us         2.45%      44.420us      14.807us       7.008us        48.67%       7.008us       2.336us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.008us        48.67%       7.008us       2.336us             3  
+                                Activity Buffer Request        78.99%       1.429ms        78.99%       1.429ms       1.429ms       2.496us        17.33%       2.496us       2.496us             1  
+                                            aten::slice         1.28%      23.160us         1.59%      28.810us       4.802us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.31%       5.650us         0.31%       5.650us       0.942us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.43%     170.603us         9.43%     170.603us      28.434us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       4.930us         0.27%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.915ms
-Self CUDA time total: 23.808us
+Self CPU time total: 1.809ms
+Self CUDA time total: 14.400us
 
 
 
@@ -4143,20 +4131,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     157.181us       605.66%     157.181us     157.181us             1  
-                                            torch_eager         5.64%     105.982us        99.73%       1.874ms       1.874ms       0.000us         0.00%      30.528us      30.528us             1  
-                                             aten::silu         2.16%      40.612us        89.86%       1.688ms     562.829us      13.440us        51.79%      18.016us       6.005us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      13.440us        51.79%      13.440us       4.480us             3  
-                                              aten::mul         1.34%      25.270us         2.38%      44.720us      14.907us      12.512us        48.21%      12.512us       4.171us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      12.512us        48.21%      12.512us       4.171us             3  
-                                Activity Buffer Request        79.27%       1.489ms        79.27%       1.489ms       1.489ms       4.576us        17.63%       4.576us       4.576us             1  
-                                            aten::slice         1.48%      27.801us         1.85%      34.741us       5.790us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.37%       6.940us         0.37%       6.940us       1.157us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.47%     177.873us         9.47%     177.873us      29.645us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.27%       5.010us         0.27%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     142.206us       914.45%     142.206us     142.206us             1  
+                                            torch_eager        21.70%     105.494us        98.87%     480.727us     480.727us       0.000us         0.00%      18.239us      18.239us             1  
+                                             aten::silu         8.21%      39.900us        62.39%     303.354us     101.118us       7.966us        51.23%      10.654us       3.551us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.966us        51.23%       7.966us       2.655us             3  
+                                              aten::mul         5.16%      25.070us         8.84%      42.990us      14.330us       7.585us        48.77%       7.585us       2.528us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.585us        48.77%       7.585us       2.528us             3  
+                                Activity Buffer Request        23.29%     113.242us        23.29%     113.242us     113.242us       2.688us        17.29%       2.688us       2.688us             1  
+                                            aten::slice         4.75%      23.080us         5.94%      28.889us       4.815us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         1.19%       5.809us         1.19%       5.809us       0.968us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        34.58%     168.132us        34.58%     168.132us      28.022us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.13%       5.500us         1.13%       5.500us       5.500us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.879ms
-Self CUDA time total: 25.952us
+Self CPU time total: 486.227us
+Self CUDA time total: 15.551us
 
 
 
@@ -4166,26 +4154,26 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     158.077us       375.10%     158.077us     158.077us             1  
-                                            torch_eager         5.61%     105.585us        99.74%       1.877ms       1.877ms       0.000us         0.00%      49.375us      49.375us             1  
-                                             aten::silu         2.18%      41.121us        90.06%       1.695ms     564.996us      21.856us        51.86%      29.088us       9.696us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      21.856us        51.86%      21.856us       7.285us             3  
-                                              aten::mul         1.38%      26.000us         2.45%      46.100us      15.367us      20.287us        48.14%      20.287us       6.762us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      20.287us        48.14%      20.287us       6.762us             3  
-                                Activity Buffer Request        79.53%       1.497ms        79.53%       1.497ms       1.497ms       7.232us        17.16%       7.232us       7.232us             1  
-                                            aten::slice         1.26%      23.718us         1.62%      30.479us       5.080us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.36%       6.761us         0.36%       6.761us       1.127us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.41%     177.183us         9.41%     177.183us      29.531us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       4.970us         0.26%       4.970us       4.970us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     149.022us       661.50%     149.022us     149.022us             1  
+                                            torch_eager         5.72%     105.900us        99.72%       1.847ms       1.847ms       0.000us         0.00%      26.431us      26.431us             1  
+                                             aten::silu         2.24%      41.461us        90.05%       1.668ms     555.875us      11.552us        51.28%      15.455us       5.152us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        51.28%      11.552us       3.851us             3  
+                                              aten::mul         1.41%      26.021us         2.40%      44.421us      14.807us      10.976us        48.72%      10.976us       3.659us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.976us        48.72%      10.976us       3.659us             3  
+                                Activity Buffer Request        79.50%       1.472ms        79.50%       1.472ms       1.472ms       3.903us        17.33%       3.903us       3.903us             1  
+                                            aten::slice         1.25%      23.131us         1.56%      28.831us       4.805us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.31%       5.700us         0.31%       5.700us       0.950us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.31%     172.382us         9.31%     172.382us      28.730us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       5.130us         0.28%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.882ms
-Self CUDA time total: 42.143us
+Self CPU time total: 1.852ms
+Self CUDA time total: 22.528us
 
 
 impl                     wl                  p50(ms)  ok
 torch_eager              cuda_T128_D1024        0.05  True
 torch_eager              cuda_T128_D2048        0.05  True
-torch_eager              cuda_T128_D768         0.05  True
+torch_eager              cuda_T128_D768         0.04  True
 torch_eager              cuda_T256_D1024        0.05  True
 torch_eager              cuda_T256_D2048        0.05  True
 torch_eager              cuda_T256_D768         0.05  True
@@ -4196,53 +4184,7 @@ torch_eager              cuda_T512_D768         0.05  True
 
▶ UV Install Logs
diff --git a/activation/results/artifacts/combine/latency.svg b/activation/results/artifacts/combine/latency.svg index bc4b8900664bea348ce4e4cdc17535a6ff8d8951..02e24e06df11cd1929543b7b6eb05b29ace9034e 100644 --- a/activation/results/artifacts/combine/latency.svg +++ b/activation/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f286130086ddc73e4e87d0a2a68de7b2f17cff9f893d7fad0e1eb7210cf7e246 -size 20694 +oid sha256:9254fad09b1905d500f91c98ba5debdf4f6497c196acc2cdc499c0572bc73647 +size 20632 diff --git a/activation/results/combined_results.html b/activation/results/combined_results.html index 424cb36877c2c2c6d1afa9c948ce04e702e8766f..ebf73560e992accff356031bd9555e356bb61b32 100644 --- a/activation/results/combined_results.html +++ b/activation/results/combined_results.html @@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - 2025-10-24T19:26:55.354611 + 2025-10-27T14:46:43.482898 image/svg+xml @@ -4021,83 +4021,83 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - + - 0.030 + 0.025 - + - + - 0.035 + 0.030 - + - + - 0.040 + 0.035 - + - + - 0.045 + 0.040 - + - + - 0.050 + 0.045 - + - + - 0.055 + 0.050 @@ -4105,37 +4105,37 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - - - - - - - - + + + + + + + + - + - - - - - - - - - + + + + + + + + + @@ -4155,25 +4155,25 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - + - + - hf_kernels_swiglu + hf_kernels_swiglu - + - + - torch_eager + torch_eager @@ -4193,7 +4193,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: combine | 38.46s +Cell: combine | 4.45s | Raw @@ -4267,13 +4267,13 @@ Cell: combine | 38.46s
======================================================================
 LOADING BENCHMARK DATA
 ======================================================================
-✓ HF Kernels SwiGLU             : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/activation/impls/.uvnote/cache/d30443d7e6209ed0a7ffb0b020b1f31815cb2e95563283b7a25710e6420dbed8
-✓ PyTorch SwiGLU                : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/activation/impls/.uvnote/cache/86a65ffc73cc3e7a7b1efe81bde7937d3f4e55d4f6b857c3fca0d9008687d8d6
+✓ HF Kernels SwiGLU             : /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/2775e6386f1caf1fda935a997130c06dcaf7641efb0db21560c35301fdabfd9b
+✓ PyTorch SwiGLU                : /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/661ca38adec8893d7c284140e922da661f0afcea4aaff6a3bf48a6494ce7c6eb
 
   ✓ Found HF Kernels SwiGLU
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/activation/impls/.uvnote/cache/d30443d7e6209ed0a7ffb0b020b1f31815cb2e95563283b7a25710e6420dbed8/activation.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/2775e6386f1caf1fda935a997130c06dcaf7641efb0db21560c35301fdabfd9b/activation.jsonl
   ✓ Found PyTorch SwiGLU
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/activation/impls/.uvnote/cache/86a65ffc73cc3e7a7b1efe81bde7937d3f4e55d4f6b857c3fca0d9008687d8d6/activation.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/661ca38adec8893d7c284140e922da661f0afcea4aaff6a3bf48a6494ce7c6eb/activation.jsonl
 
 ======================================================================
 Summary: 2 found, 0 skipped, 0 missing
@@ -4293,7 +4293,7 @@ hf_kernels_swiglu        cuda_T512_D2048        0.03  True
 hf_kernels_swiglu        cuda_T512_D768         0.03  True
 torch_eager              cuda_T128_D1024        0.05  True
 torch_eager              cuda_T128_D2048        0.05  True
-torch_eager              cuda_T128_D768         0.05  True
+torch_eager              cuda_T128_D768         0.04  True
 torch_eager              cuda_T256_D1024        0.05  True
 torch_eager              cuda_T256_D2048        0.05  True
 torch_eager              cuda_T256_D768         0.05  True
@@ -4319,53 +4319,7 @@ Implementations included:
 
▶ UV Install Logs
@@ -4378,7 +4332,7 @@ Installed 37 packages in 212ms - 2025-10-24T19:26:55.354611 + 2025-10-27T14:46:43.482898 image/svg+xml @@ -4527,83 +4481,83 @@ Installed 37 packages in 212ms - + - + - 0.030 + 0.025 - + - + - 0.035 + 0.030 - + - + - 0.040 + 0.035 - + - + - 0.045 + 0.040 - + - + - 0.050 + 0.045 - + - + - 0.055 + 0.050 @@ -4611,37 +4565,37 @@ Installed 37 packages in 212ms - + - - - - - - - - + + + + + + + + - + - - - - - - - - - + + + + + + + + + @@ -4661,25 +4615,25 @@ Installed 37 packages in 212ms - + - + - + - hf_kernels_swiglu + hf_kernels_swiglu - + - + - torch_eager + torch_eager diff --git a/flash_attn/impls/artifacts/benchmark/attention.jsonl b/flash_attn/impls/artifacts/benchmark/attention.jsonl index a71c816c6a4debc70577fca4dc7743032e2e6be5..f7b87bffff02cfb69a5abf9ea7fad8f878048292 100644 --- a/flash_attn/impls/artifacts/benchmark/attention.jsonl +++ b/flash_attn/impls/artifacts/benchmark/attention.jsonl @@ -1,6 +1,6 @@ -{"ts": "2025-10-24T19:25:35Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 2.817559987306595, "p50": 2.819840970914811, "p90": 2.8203310212120414, "mean": 2.8193464037030935, "iqr": 0.002661021426320076, "raw_times": [2.8176699997857213, 2.8203310212120414, 2.821330039296299, 2.819840970914811, 2.817559987306595], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.8170199948363006, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003528594970703125, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-24T19:25:35Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 3.9076139801181853, "p50": 3.9129150100052357, "p90": 3.91379400389269, "mean": 3.920128010213375, "iqr": 0.0021209707483649254, "raw_times": [3.9546440239064395, 3.9076139801181853, 3.9129150100052357, 3.911673033144325, 3.91379400389269], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.1108770053833723, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-24T19:25:35Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 4.073257034178823, "p50": 4.119218967389315, "p90": 4.122229001950473, "mean": 4.102474392857403, "iqr": 0.04891102435067296, "raw_times": [4.073257034178823, 4.122229001950473, 4.119218967389315, 4.124348983168602, 4.0733179775998], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.606237005442381, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-24T19:25:36Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 4.078477970324457, "p50": 4.127818974666297, "p90": 4.151278990320861, "mean": 4.122894583269954, "iqr": 0.06814103107899427, "raw_times": [4.173759021796286, 4.151278990320861, 4.127818974666297, 4.078477970324457, 4.083137959241867], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.617736976593733, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-24T19:25:36Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 4.104706982616335, "p50": 4.1118780500255525, "p90": 4.146788967773318, "mean": 4.123546194750816, "iqr": 0.0404709717258811, "raw_times": [4.106317996047437, 4.104706982616335, 4.1118780500255525, 4.146788967773318, 4.148038977291435], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.064576991368085, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003528594970703125, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-24T19:25:36Z", "run": "352bdfcdb8354c52bb9bcfb05fb3e88b", "impl": "hf_kernels_flash_attn", "tags": {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 4.358973994385451, "p50": 4.570448014419526, "p90": 4.571158031467348, "mean": 4.518645000644028, "iqr": 0.052271061576902866, "raw_times": [4.358973994385451, 4.570448014419526, 4.57375799305737, 4.571158031467348, 4.5188869698904455], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.924274002201855, "peak_bytes": 319946752, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003528594970703125, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}} +{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}} +{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}} +{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}} +{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}} +{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}} diff --git a/flash_attn/impls/cells/benchmark.py b/flash_attn/impls/cells/benchmark.py index 3322ea52931b529ccb87a69b788c5390d7ca6dd7..64fe6a4eeb97a838a63f7152c1133db1ed3229d9 100644 --- a/flash_attn/impls/cells/benchmark.py +++ b/flash_attn/impls/cells/benchmark.py @@ -3,8 +3,9 @@ # dependencies = [ # "numpy", # "torch==2.8.0", -# "kernels-benchmark-tools", # "kernels", +# "kernels-benchmark-tools", +# "sageattention", # ] # # [tool.uv.sources] @@ -15,18 +16,18 @@ import sys from kernels_benchmark_tools import KernelTypeEnum, run_benchmark from kernels import get_kernel -# Load the flash attention kernel -hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn") +# Load the sage attention kernel +hf_kernels_sage_attn = get_kernel("kernels-community/sage_attention") -def hf_flash_attention(query, key, value): - """HuggingFace Kernels Flash Attention""" - return hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)[0] +def sage_attention(query, key, value): + """SageAttention with INT8 Q/K quantization and FP16 P/V""" + return hf_kernels_sage_attn.fwd(query, key, value, is_causal=False)[0] run_benchmark( kernel_type=KernelTypeEnum.ATTENTION, - impl_name="hf_kernels_flash_attn", - impl_tags={"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, - impl_func=hf_flash_attention, + impl_name="sage_int8_fp16", + impl_tags={"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, + impl_func=sage_attention, ) \ No newline at end of file diff --git a/flash_attn/impls/flash_attention.html b/flash_attn/impls/flash_attention.html index 1a8157b79111cd82be83b7682897d7f7f715a588..865b225e49d1ec1a10bb57c96dc824f8c850085f 100644 --- a/flash_attn/impls/flash_attention.html +++ b/flash_attn/impls/flash_attention.html @@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 4.05s +Cell: nv | 0.26s | Raw @@ -3888,34 +3888,22 @@ Cell: nv | 4.05s
-
Fri Oct 24 19:21:04 2025       
+
Mon Oct 27 14:45:45 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
-|   0  NVIDIA L4                      Off |   00000000:38:00.0 Off |                    0 |
-| N/A   36C    P0             27W /   72W |       1MiB /  23034MiB |      0%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   1  NVIDIA L4                      Off |   00000000:3A:00.0 Off |                    0 |
-| N/A   33C    P0             28W /   72W |       1MiB /  23034MiB |      2%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   2  NVIDIA L4                      Off |   00000000:3C:00.0 Off |                    0 |
-| N/A   34C    P0             27W /   72W |       1MiB /  23034MiB |      1%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   3  NVIDIA L4                      Off |   00000000:3E:00.0 Off |                    0 |
-| N/A   33C    P0             27W /   72W |       1MiB /  23034MiB |      2%      Default |
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   31C    P0            135W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
 +-----------------------------------------------------------------------------------------+
 | Processes:                                                                              |
-|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
 |        ID   ID                                                               Usage      |
 |=========================================================================================|
 |  No running processes found                                                             |
@@ -3931,9 +3919,9 @@ Cell: nv | 4.05s
 
 ▼ code 
 ▼ output
- ▶ uv-logs
+ ▶ uv-logs
  | 
-Cell: benchmark | 44.13s
+Cell: benchmark | 3.87s
  | 
 
 Raw
@@ -3984,29 +3972,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         2.87%     353.236us        20.60%       2.536ms       2.536ms       0.000us         0.00%      10.773ms      10.773ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us      10.620ms       100.09%      10.620ms      10.620ms             1  
-                     aten::scaled_dot_product_attention         0.36%      44.342us         1.92%     236.065us      78.688us       0.000us         0.00%       8.386ms       2.795ms             3  
-              aten::_scaled_dot_product_flash_attention         0.24%      29.551us         1.56%     191.723us      63.908us       0.000us         0.00%       8.386ms       2.795ms             3  
-                         aten::_flash_attention_forward         0.31%      38.342us         1.10%     135.583us      45.194us       8.386ms        79.03%       8.386ms       2.795ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       8.386ms        79.03%       8.386ms       2.795ms             3  
-                                       aten::contiguous         0.12%      15.199us        15.18%       1.869ms     155.744us       0.000us         0.00%       2.387ms     198.924us            12  
-                                            aten::clone         0.36%      44.321us        15.06%       1.854ms     154.478us       0.000us         0.00%       2.387ms     198.924us            12  
-                                            aten::copy_         0.78%      95.990us        13.98%       1.720ms     143.361us       2.225ms        20.97%       2.387ms     198.924us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.225ms        20.97%       2.225ms     185.396us            12  
-                                Activity Buffer Request        12.35%       1.520ms        12.35%       1.520ms       1.520ms     162.335us         1.53%     162.335us     162.335us             1  
-                                        aten::transpose         0.62%      76.778us         0.84%     103.972us       4.332us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.22%      27.194us         0.22%      27.194us       1.133us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.24%      30.024us         0.91%     112.425us       7.495us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.80%      98.881us         0.80%      98.881us       4.120us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         1.06%     129.984us         1.06%     129.984us       8.666us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.14%      17.180us         0.14%      17.180us       5.727us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.02%       2.899us         0.02%       2.899us       0.483us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.10%      11.980us         0.10%      11.980us       3.993us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        79.40%       9.774ms        79.40%       9.774ms       9.774ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.610ms       101.76%       3.610ms       3.610ms             1  
+                                         torch_flash_ma         6.54%     340.396us        46.01%       2.394ms       2.394ms       0.000us         0.00%       3.588ms       3.588ms             1  
+                     aten::scaled_dot_product_attention         0.84%      43.810us         4.24%     220.593us      73.531us       0.000us         0.00%       2.829ms     943.091us             3  
+              aten::_scaled_dot_product_flash_attention         0.51%      26.609us         3.40%     176.783us      58.928us       0.000us         0.00%       2.829ms     943.091us             3  
+                         aten::_flash_attention_forward         0.74%      38.381us         2.45%     127.692us      42.564us       2.829ms        79.74%       2.829ms     943.091us             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.829ms        79.74%       2.829ms     943.091us             3  
+                                       aten::contiguous         0.29%      15.001us        33.86%       1.762ms     146.802us       0.000us         0.00%     759.072us      63.256us            12  
+                                            aten::clone         0.76%      39.432us        33.57%       1.747ms     145.552us       0.000us         0.00%     759.072us      63.256us            12  
+                                            aten::copy_         1.71%      88.801us        31.26%       1.626ms     135.534us     718.688us        20.26%     759.072us      63.256us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     718.688us        20.26%     718.688us      59.891us            12  
+                                Activity Buffer Request        27.68%       1.440ms        27.68%       1.440ms       1.440ms      40.384us         1.14%      40.384us      40.384us             1  
+                                        aten::transpose         1.34%      69.973us         1.80%      93.503us       3.896us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.45%      23.530us         0.45%      23.530us       0.980us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.50%      25.908us         1.97%     102.319us       6.821us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.75%      91.041us         1.75%      91.041us       3.793us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.36%     123.031us         2.36%     123.031us       8.202us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.31%      16.010us         0.31%      16.010us       5.337us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.05%       2.700us         0.05%       2.700us       0.450us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.17%       8.980us         0.17%       8.980us       2.993us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        53.99%       2.809ms        53.99%       2.809ms       2.809ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 12.309ms
-Self CUDA time total: 10.610ms
+Self CPU time total: 5.203ms
+Self CUDA time total: 3.548ms
 
 
 
@@ -4016,29 +4004,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         1.72%     263.576us        14.84%       2.279ms       2.279ms       0.000us         0.00%      13.971ms      13.971ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us      13.784ms       100.09%      13.784ms      13.784ms             1  
-                     aten::scaled_dot_product_attention         0.17%      25.751us         1.16%     178.074us      59.358us       0.000us         0.00%      11.389ms       3.796ms             3  
-              aten::_scaled_dot_product_flash_attention         0.12%      18.370us         0.99%     152.323us      50.774us       0.000us         0.00%      11.389ms       3.796ms             3  
-                         aten::_flash_attention_forward         0.21%      32.869us         0.72%     109.873us      36.624us      11.389ms        82.70%      11.389ms       3.796ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us      11.389ms        82.70%      11.389ms       3.796ms             3  
-                                       aten::contiguous         0.06%       9.710us        11.64%       1.787ms     148.932us       0.000us         0.00%       2.582ms     215.169us            12  
-                                            aten::clone         0.19%      29.062us        11.57%       1.777ms     148.123us       0.000us         0.00%       2.582ms     215.169us            12  
-                                            aten::copy_         0.55%      83.901us        10.97%       1.685ms     140.395us       2.382ms        17.30%       2.582ms     215.169us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.382ms        17.30%       2.382ms     198.534us            12  
-                                Activity Buffer Request         9.88%       1.517ms         9.88%       1.517ms       1.517ms     199.614us         1.45%     199.614us     199.614us             1  
-                                        aten::transpose         0.36%      54.739us         0.48%      74.091us       3.087us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.13%      19.352us         0.13%      19.352us       0.806us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.13%      19.810us         0.54%      82.371us       5.491us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.51%      77.821us         0.51%      77.821us       3.243us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         0.70%     107.293us         0.70%     107.293us       7.153us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.09%      13.681us         0.09%      13.681us       4.560us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.01%       1.961us         0.01%       1.961us       0.327us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.03%       4.001us         0.03%       4.001us       1.334us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        85.16%      13.081ms        85.16%      13.081ms      13.081ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         5.17%     272.917us        42.06%       2.218ms       2.218ms       0.000us         0.00%       3.821ms       3.821ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.777ms       100.28%       3.777ms       3.777ms             1  
+                     aten::scaled_dot_product_attention         0.53%      27.761us         3.55%     187.333us      62.444us       0.000us         0.00%       3.004ms       1.001ms             3  
+              aten::_scaled_dot_product_flash_attention         0.37%      19.492us         3.03%     159.572us      53.191us       0.000us         0.00%       3.004ms       1.001ms             3  
+                         aten::_flash_attention_forward         0.75%      39.549us         2.23%     117.371us      39.124us       3.004ms        79.75%       3.004ms       1.001ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.004ms        79.75%       3.004ms       1.001ms             3  
+                                       aten::contiguous         0.20%      10.320us        32.06%       1.691ms     140.876us       0.000us         0.00%     817.314us      68.110us            12  
+                                            aten::clone         0.55%      29.048us        31.86%       1.680ms     140.016us       0.000us         0.00%     817.314us      68.110us            12  
+                                            aten::copy_         1.64%      86.662us        30.11%       1.588ms     132.347us     762.658us        20.25%     817.314us      68.110us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     762.658us        20.25%     762.658us      63.555us            12  
+                                Activity Buffer Request        26.84%       1.415ms        26.84%       1.415ms       1.415ms      54.656us         1.45%      54.656us      54.656us             1  
+                                        aten::transpose         1.36%      71.528us         1.71%      90.179us       3.757us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.35%      18.651us         0.35%      18.651us       0.777us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.38%      19.801us         1.55%      81.840us       5.456us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.46%      77.040us         1.46%      77.040us       3.210us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.07%     108.973us         2.07%     108.973us       7.265us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.26%      13.940us         0.26%      13.940us       4.647us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.06%       2.910us         0.06%       2.910us       0.485us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.08%       4.240us         0.08%       4.240us       1.413us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        57.94%       3.056ms        57.94%       3.056ms       3.056ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 15.360ms
-Self CUDA time total: 13.772ms
+Self CPU time total: 5.274ms
+Self CUDA time total: 3.767ms
 
 
 
@@ -4048,29 +4036,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         1.59%     253.009us        16.33%       2.606ms       2.606ms       0.000us         0.00%      14.231ms      14.231ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us      14.040ms       100.09%      14.040ms      14.040ms             1  
-                     aten::scaled_dot_product_attention         0.16%      26.200us         1.12%     178.593us      59.531us       0.000us         0.00%      11.609ms       3.870ms             3  
-              aten::_scaled_dot_product_flash_attention         0.12%      19.071us         0.96%     152.393us      50.798us       0.000us         0.00%      11.609ms       3.870ms             3  
-                         aten::_flash_attention_forward         0.21%      33.032us         0.69%     110.322us      36.774us      11.609ms        82.76%      11.609ms       3.870ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us      11.609ms        82.76%      11.609ms       3.870ms             3  
-                                       aten::contiguous         0.06%      10.030us        13.32%       2.125ms     177.070us       0.000us         0.00%       2.623ms     218.547us            12  
-                                            aten::clone         0.18%      28.858us        13.25%       2.115ms     176.235us       0.000us         0.00%       2.623ms     218.547us            12  
-                                            aten::copy_         0.51%      81.604us        12.67%       2.022ms     168.500us       2.418ms        17.24%       2.623ms     218.547us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.418ms        17.24%       2.418ms     201.529us            12  
-                                Activity Buffer Request        11.62%       1.854ms        11.62%       1.854ms       1.854ms     204.222us         1.46%     204.222us     204.222us             1  
-                                        aten::transpose         0.33%      52.790us         0.45%      72.350us       3.015us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.12%      19.560us         0.12%      19.560us       0.815us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.12%      19.891us         0.52%      83.030us       5.535us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.49%      77.888us         0.49%      77.888us       3.245us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         0.69%     109.402us         0.69%     109.402us       7.293us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.09%      14.430us         0.09%      14.430us       4.810us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.01%       1.730us         0.01%       1.730us       0.288us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.02%       3.831us         0.02%       3.831us       1.277us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        83.67%      13.349ms        83.67%      13.349ms      13.349ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.99%     269.576us        41.89%       2.262ms       2.262ms       0.000us         0.00%       3.875ms       3.875ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.827ms       100.29%       3.827ms       3.827ms             1  
+                     aten::scaled_dot_product_attention         0.50%      27.011us         3.47%     187.262us      62.421us       0.000us         0.00%       3.037ms       1.012ms             3  
+              aten::_scaled_dot_product_flash_attention         0.35%      18.851us         2.97%     160.251us      53.417us       0.000us         0.00%       3.037ms       1.012ms             3  
+                         aten::_flash_attention_forward         0.72%      39.000us         2.20%     118.550us      39.517us       3.037ms        79.57%       3.037ms       1.012ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.037ms        79.57%       3.037ms       1.012ms             3  
+                                       aten::contiguous         0.18%       9.780us        32.51%       1.755ms     146.253us       0.000us         0.00%     838.461us      69.872us            12  
+                                            aten::clone         0.54%      29.119us        32.32%       1.745ms     145.438us       0.000us         0.00%     838.461us      69.872us            12  
+                                            aten::copy_         1.56%      84.200us        30.52%       1.648ms     137.328us     779.741us        20.43%     838.461us      69.872us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     779.741us        20.43%     779.741us      64.978us            12  
+                                Activity Buffer Request        27.41%       1.480ms        27.41%       1.480ms       1.480ms      58.720us         1.54%      58.720us      58.720us             1  
+                                        aten::transpose         1.00%      54.180us         1.34%      72.500us       3.021us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.34%      18.320us         0.34%      18.320us       0.763us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.36%      19.560us         1.66%      89.381us       5.959us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.53%      82.821us         1.53%      82.821us       3.451us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         1.99%     107.272us         1.99%     107.272us       7.151us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.30%      16.380us         0.30%      16.380us       5.460us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.850us         0.03%       1.850us       0.308us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.830us         0.07%       3.830us       1.277us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        58.11%       3.138ms        58.11%       3.138ms       3.138ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 15.955ms
-Self CUDA time total: 14.027ms
+Self CPU time total: 5.399ms
+Self CUDA time total: 3.817ms
 
 
 
@@ -4080,29 +4068,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         1.54%     253.696us        15.59%       2.567ms       2.567ms       0.000us         0.00%      14.787ms      14.787ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us      14.594ms       100.09%      14.594ms      14.594ms             1  
-                     aten::scaled_dot_product_attention         0.16%      26.450us         1.08%     178.164us      59.388us       0.000us         0.00%      12.117ms       4.039ms             3  
-              aten::_scaled_dot_product_flash_attention         0.12%      18.962us         0.92%     151.714us      50.571us       0.000us         0.00%      12.117ms       4.039ms             3  
-                         aten::_flash_attention_forward         0.20%      32.440us         0.66%     109.033us      36.344us      12.117ms        83.10%      12.117ms       4.039ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us      12.117ms        83.10%      12.117ms       4.039ms             3  
-                                       aten::contiguous         0.06%      10.538us        12.68%       2.087ms     173.951us       0.000us         0.00%       2.670ms     222.462us            12  
-                                            aten::clone         0.17%      28.412us        12.61%       2.077ms     173.073us       0.000us         0.00%       2.670ms     222.462us            12  
-                                            aten::copy_         0.50%      82.093us        12.05%       1.984ms     165.351us       2.464ms        16.90%       2.670ms     222.462us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.464ms        16.90%       2.464ms     205.326us            12  
-                                Activity Buffer Request         9.45%       1.555ms         9.45%       1.555ms       1.555ms     205.630us         1.41%     205.630us     205.630us             1  
-                                        aten::transpose         0.32%      52.269us         0.44%      71.730us       2.989us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.12%      19.461us         0.12%      19.461us       0.811us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.12%      19.690us         0.51%      84.151us       5.610us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%      77.802us         0.47%      77.802us       3.242us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         2.24%     369.337us         2.24%     369.337us      24.622us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.09%      14.871us         0.09%      14.871us       4.957us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.01%       1.880us         0.01%       1.880us       0.313us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.02%       4.010us         0.02%       4.010us       1.337us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        84.41%      13.899ms        84.41%      13.899ms      13.899ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.76%     268.853us        43.13%       2.435ms       2.435ms       0.000us         0.00%       3.964ms       3.964ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.917ms       100.30%       3.917ms       3.917ms             1  
+                     aten::scaled_dot_product_attention         0.49%      27.720us         3.46%     195.333us      65.111us       0.000us         0.00%       3.118ms       1.039ms             3  
+              aten::_scaled_dot_product_flash_attention         0.34%      19.471us         2.97%     167.613us      55.871us       0.000us         0.00%       3.118ms       1.039ms             3  
+                         aten::_flash_attention_forward         0.70%      39.530us         2.23%     125.742us      41.914us       3.118ms        79.84%       3.118ms       1.039ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.118ms        79.84%       3.118ms       1.039ms             3  
+                                       aten::contiguous         0.17%       9.719us        34.03%       1.921ms     160.116us       0.000us         0.00%     845.599us      70.467us            12  
+                                            aten::clone         0.52%      29.239us        33.85%       1.912ms     159.306us       0.000us         0.00%     845.599us      70.467us            12  
+                                            aten::copy_         1.54%      86.910us        32.19%       1.818ms     151.460us     787.167us        20.16%     845.599us      70.467us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     787.167us        20.16%     787.167us      65.597us            12  
+                                Activity Buffer Request        25.41%       1.435ms        25.41%       1.435ms       1.435ms      58.432us         1.50%      58.432us      58.432us             1  
+                                        aten::transpose         0.96%      54.080us         1.28%      72.141us       3.006us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.32%      18.061us         0.32%      18.061us       0.753us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.35%      19.512us         1.49%      84.134us       5.609us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.53%      86.581us         1.53%      86.581us       3.608us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         5.66%     319.547us         5.66%     319.547us      21.303us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.26%      14.430us         0.26%      14.430us       4.810us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.05%       2.740us         0.05%       2.740us       0.457us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       4.201us         0.07%       4.201us       1.400us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        56.87%       3.211ms        56.87%       3.211ms       3.211ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 16.466ms
-Self CUDA time total: 14.581ms
+Self CPU time total: 5.647ms
+Self CUDA time total: 3.906ms
 
 
 
@@ -4112,29 +4100,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         1.70%     278.864us        15.50%       2.543ms       2.543ms       0.000us         0.00%      14.797ms      14.797ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us      14.600ms       100.09%      14.600ms      14.600ms             1  
-                     aten::scaled_dot_product_attention         0.17%      27.381us         1.16%     189.724us      63.241us       0.000us         0.00%      12.088ms       4.029ms             3  
-              aten::_scaled_dot_product_flash_attention         0.12%      19.359us         0.99%     162.343us      54.114us       0.000us         0.00%      12.088ms       4.029ms             3  
-                         aten::_flash_attention_forward         0.21%      33.700us         0.72%     118.223us      39.408us      12.088ms        82.87%      12.088ms       4.029ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us      12.088ms        82.87%      12.088ms       4.029ms             3  
-                                       aten::contiguous         0.06%      10.278us        12.35%       2.025ms     168.720us       0.000us         0.00%       2.709ms     225.729us            12  
-                                            aten::clone         0.18%      29.935us        12.28%       2.014ms     167.864us       0.000us         0.00%       2.709ms     225.729us            12  
-                                            aten::copy_         0.52%      84.857us        11.68%       1.915ms     159.605us       2.499ms        17.13%       2.709ms     225.729us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.499ms        17.13%       2.499ms     208.262us            12  
-                                Activity Buffer Request         9.10%       1.493ms         9.10%       1.493ms       1.493ms     209.598us         1.44%     209.598us     209.598us             1  
-                                        aten::transpose         0.33%      54.376us         0.45%      74.216us       3.092us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.12%      19.840us         0.12%      19.840us       0.827us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.12%      20.251us         0.54%      88.821us       5.921us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.50%      82.172us         0.50%      82.172us       3.424us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         2.25%     368.209us         2.25%     368.209us      24.547us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.09%      14.850us         0.09%      14.850us       4.950us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.01%       2.110us         0.01%       2.110us       0.352us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.02%       3.861us         0.02%       3.861us       1.287us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        84.50%      13.857ms        84.50%      13.857ms      13.857ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         5.25%     320.614us        40.80%       2.490ms       2.490ms       0.000us         0.00%       4.428ms       4.428ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.377ms       100.25%       4.377ms       4.377ms             1  
+                     aten::scaled_dot_product_attention         0.44%      26.800us         3.27%     199.713us      66.571us       0.000us         0.00%       3.558ms       1.186ms             3  
+              aten::_scaled_dot_product_flash_attention         0.32%      19.239us         2.83%     172.913us      57.638us       0.000us         0.00%       3.558ms       1.186ms             3  
+                         aten::_flash_attention_forward         0.64%      38.816us         2.13%     129.963us      43.321us       3.558ms        81.48%       3.558ms       1.186ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.558ms        81.48%       3.558ms       1.186ms             3  
+                                       aten::contiguous         0.17%      10.568us        31.48%       1.922ms     160.138us       0.000us         0.00%     870.015us      72.501us            12  
+                                            aten::clone         0.48%      29.552us        31.31%       1.911ms     159.257us       0.000us         0.00%     870.015us      72.501us            12  
+                                            aten::copy_         1.37%      83.622us        29.71%       1.813ms     151.123us     808.479us        18.52%     870.015us      72.501us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     808.479us        18.52%     808.479us      67.373us            12  
+                                Activity Buffer Request        24.07%       1.469ms        24.07%       1.469ms       1.469ms      61.536us         1.41%      61.536us      61.536us             1  
+                                        aten::transpose         0.88%      53.494us         1.18%      71.893us       2.996us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.30%      18.399us         0.30%      18.399us       0.767us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.45%      27.388us         1.61%      98.450us       6.563us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.35%      82.243us         1.35%      82.243us       3.427us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         4.68%     285.943us         4.68%     285.943us      19.063us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.29%      17.820us         0.29%      17.820us       5.940us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.04%       2.328us         0.04%       2.328us       0.388us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       4.078us         0.07%       4.078us       1.359us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        59.20%       3.614ms        59.20%       3.614ms       3.614ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 16.399ms
-Self CUDA time total: 14.587ms
+Self CPU time total: 6.104ms
+Self CUDA time total: 4.366ms
 
 
 
@@ -4144,91 +4132,39 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         1.34%     250.556us        18.55%       3.457ms       3.457ms       0.000us         0.00%      16.094ms      16.094ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us      15.878ms       100.09%      15.878ms      15.878ms             1  
-                     aten::scaled_dot_product_attention         0.14%      25.201us         0.97%     180.244us      60.081us       0.000us         0.00%      12.955ms       4.318ms             3  
-              aten::_scaled_dot_product_flash_attention         0.10%      18.431us         0.83%     155.043us      51.681us       0.000us         0.00%      12.955ms       4.318ms             3  
-                         aten::_flash_attention_forward         0.18%      33.193us         0.61%     113.432us      37.811us      12.955ms        81.66%      12.955ms       4.318ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us      12.955ms        81.66%      12.955ms       4.318ms             3  
-                                       aten::contiguous         0.05%      10.100us        15.97%       2.976ms     248.003us       0.000us         0.00%       3.139ms     261.603us            12  
-                                            aten::clone         0.16%      29.450us        15.92%       2.966ms     247.161us       0.000us         0.00%       3.139ms     261.603us            12  
-                                            aten::copy_         0.46%      85.134us        15.41%       2.871ms     239.275us       2.909ms        18.34%       3.139ms     261.603us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.909ms        18.34%       2.909ms     242.440us            12  
-                                Activity Buffer Request         8.03%       1.497ms         8.03%       1.497ms       1.497ms     229.949us         1.45%     229.949us     229.949us             1  
-                                        aten::transpose         0.29%      53.550us         0.39%      73.110us       3.046us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.10%      19.560us         0.10%      19.560us       0.815us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.11%      19.791us         0.47%      87.501us       5.833us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.42%      78.571us         0.42%      78.571us       3.274us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         7.05%       1.313ms         7.05%       1.313ms      87.561us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.09%      17.450us         0.09%      17.450us       5.817us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.01%       1.828us         0.01%       1.828us       0.305us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.02%       3.779us         0.02%       3.779us       1.260us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        81.45%      15.178ms        81.45%      15.178ms      15.178ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.45%     272.752us        38.96%       2.390ms       2.390ms       0.000us         0.00%       4.517ms       4.517ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.467ms       100.24%       4.467ms       4.467ms             1  
+                     aten::scaled_dot_product_attention         0.45%      27.641us         3.22%     197.213us      65.738us       0.000us         0.00%       3.636ms       1.212ms             3  
+              aten::_scaled_dot_product_flash_attention         0.32%      19.841us         2.76%     169.572us      56.524us       0.000us         0.00%       3.636ms       1.212ms             3  
+                         aten::_flash_attention_forward         0.71%      43.282us         2.06%     126.092us      42.031us       3.636ms        81.58%       3.636ms       1.212ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.636ms        81.58%       3.636ms       1.212ms             3  
+                                       aten::contiguous         0.18%      11.069us        30.46%       1.869ms     155.711us       0.000us         0.00%     881.085us      73.424us            12  
+                                            aten::clone         0.50%      30.953us        30.28%       1.857ms     154.789us       0.000us         0.00%     881.085us      73.424us            12  
+                                            aten::copy_         1.39%      85.529us        28.66%       1.758ms     146.482us     820.670us        18.42%     881.085us      73.424us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     820.670us        18.42%     820.670us      68.389us            12  
+                                Activity Buffer Request        23.40%       1.435ms        23.40%       1.435ms       1.435ms      60.415us         1.36%      60.415us      60.415us             1  
+                                        aten::transpose         0.92%      56.138us         1.22%      75.130us       3.130us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.31%      18.992us         0.31%      18.992us       0.791us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.33%      20.287us         1.48%      90.810us       6.054us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.36%      83.613us         1.36%      83.613us       3.484us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         4.26%     261.175us         4.26%     261.175us      17.412us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.28%      17.260us         0.28%      17.260us       5.753us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.850us         0.03%       1.850us       0.308us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       4.250us         0.07%       4.250us       1.417us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        61.04%       3.744ms        61.04%       3.744ms       3.744ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 18.634ms
-Self CUDA time total: 15.864ms
+Self CPU time total: 6.134ms
+Self CUDA time total: 4.456ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_flash_ma           cuda_attn_L128_bfloat16     4.09  True
-torch_flash_ma           cuda_attn_L256_bfloat16     4.79  True
-torch_flash_ma           cuda_attn_L320_bfloat16     4.90  True
-torch_flash_ma           cuda_attn_L384_bfloat16     4.98  True
-torch_flash_ma           cuda_attn_L448_bfloat16     5.05  True
-torch_flash_ma           cuda_attn_L512_bfloat16     5.47  True
+torch_flash_ma           cuda_attn_L128_bfloat16     1.22  True
+torch_flash_ma           cuda_attn_L256_bfloat16     1.27  True
+torch_flash_ma           cuda_attn_L320_bfloat16     1.31  True
+torch_flash_ma           cuda_attn_L384_bfloat16     1.34  True
+torch_flash_ma           cuda_attn_L448_bfloat16     1.48  True
+torch_flash_ma           cuda_attn_L512_bfloat16     1.52  True
 
-
-
▶ UV Install Logs
- -

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/hf_kernels_flash_attn.html b/flash_attn/impls/hf_kernels_flash_attn.html index 89b7e537b19452298016ec6db0ede83224aeee2c..377e4f883c400300d7994f075a1a49399ece1b0c 100644 --- a/flash_attn/impls/hf_kernels_flash_attn.html +++ b/flash_attn/impls/hf_kernels_flash_attn.html @@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: benchmark | 47.93s +Cell: benchmark | 35.44s | Raw @@ -3926,21 +3926,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 1.75% 172.444us 18.87% 1.860ms 1.860ms 0.000us 0.00% 10.982ms 10.982ms 1 - _flash_attn_9e27194::fwd 0.72% 71.472us 17.12% 1.688ms 562.609us 8.236ms 100.00% 10.982ms 3.661ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 8.238ms 100.02% 8.238ms 8.238ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 8.236ms 100.00% 8.236ms 2.745ms 3 - Activity Buffer Request 14.98% 1.477ms 14.98% 1.477ms 1.477ms 2.746ms 33.34% 2.746ms 2.746ms 1 - cudaDeviceGetAttribute 0.11% 11.099us 0.11% 11.099us 0.740us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.19% 18.800us 0.53% 52.161us 17.387us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.34% 33.361us 0.34% 33.361us 11.120us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.27% 26.650us 0.27% 26.650us 2.961us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.09% 8.722us 0.09% 8.722us 2.907us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.41% 40.651us 0.41% 40.651us 13.550us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 81.13% 8.001ms 81.13% 8.001ms 8.001ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 3.89% 173.532us 41.54% 1.852ms 1.852ms 0.000us 0.00% 3.821ms 3.821ms 1 + _flash_attn_9e27194::fwd 1.71% 76.382us 37.65% 1.679ms 559.513us 2.851ms 100.00% 3.821ms 1.274ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.852ms 100.05% 2.852ms 2.852ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.851ms 100.00% 2.851ms 950.289us 3 + Activity Buffer Request 32.53% 1.450ms 32.53% 1.450ms 1.450ms 970.364us 34.04% 970.364us 970.364us 1 + cudaDeviceGetAttribute 0.10% 4.520us 0.10% 4.520us 0.301us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.46% 20.440us 1.29% 57.461us 19.154us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.83% 37.021us 0.83% 37.021us 12.340us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.76% 33.730us 0.76% 33.730us 3.748us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.29% 12.870us 0.29% 12.870us 4.290us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.97% 43.280us 0.97% 43.280us 14.427us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 58.46% 2.606ms 58.46% 2.606ms 2.606ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 9.861ms -Self CUDA time total: 8.236ms +Self CPU time total: 4.458ms +Self CUDA time total: 2.851ms @@ -3950,21 +3950,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 0.74% 96.063us 13.14% 1.699ms 1.699ms 0.000us 0.00% 15.210ms 15.210ms 1 - _flash_attn_9e27194::fwd 0.37% 48.372us 12.39% 1.603ms 534.225us 11.384ms 100.00% 15.210ms 5.070ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 11.386ms 100.02% 11.386ms 11.386ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 11.384ms 100.00% 11.384ms 3.795ms 3 - Activity Buffer Request 11.40% 1.474ms 11.40% 1.474ms 1.474ms 3.826ms 33.61% 3.826ms 3.826ms 1 - cudaDeviceGetAttribute 0.03% 4.448us 0.03% 4.448us 0.297us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.05% 6.910us 0.18% 23.882us 7.961us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.13% 16.972us 0.13% 16.972us 5.657us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.17% 21.490us 0.17% 21.490us 2.388us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.03% 3.650us 0.03% 3.650us 1.217us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.21% 26.920us 0.21% 26.920us 8.973us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 86.86% 11.232ms 86.86% 11.232ms 11.232ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.32% 104.162us 37.24% 1.676ms 1.676ms 0.000us 0.00% 4.000ms 4.000ms 1 + _flash_attn_9e27194::fwd 1.05% 47.052us 34.93% 1.571ms 523.812us 2.988ms 100.00% 4.000ms 1.333ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.989ms 100.04% 2.989ms 2.989ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.988ms 100.00% 2.988ms 995.942us 3 + Activity Buffer Request 32.02% 1.441ms 32.02% 1.441ms 1.441ms 1.012ms 33.87% 1.012ms 1.012ms 1 + cudaDeviceGetAttribute 0.10% 4.331us 0.10% 4.331us 0.289us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.16% 7.210us 0.52% 23.350us 7.783us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.36% 16.140us 0.36% 16.140us 5.380us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.47% 21.320us 0.47% 21.320us 2.369us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.10% 4.349us 0.10% 4.349us 1.450us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.67% 30.329us 0.67% 30.329us 10.110us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 62.76% 2.824ms 62.76% 2.824ms 2.824ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 12.931ms -Self CUDA time total: 11.384ms +Self CPU time total: 4.499ms +Self CUDA time total: 2.988ms @@ -3974,21 +3974,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 0.67% 91.024us 12.59% 1.703ms 1.703ms 0.000us 0.00% 15.954ms 15.954ms 1 - _flash_attn_9e27194::fwd 0.35% 47.311us 11.92% 1.612ms 537.434us 11.964ms 100.00% 15.954ms 5.318ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 11.966ms 100.01% 11.966ms 11.966ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 11.964ms 100.00% 11.964ms 3.988ms 3 - Activity Buffer Request 10.98% 1.485ms 10.98% 1.485ms 1.485ms 3.990ms 33.35% 3.990ms 3.990ms 1 - cudaDeviceGetAttribute 0.03% 4.340us 0.03% 4.340us 0.289us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.06% 8.720us 0.18% 24.830us 8.277us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.12% 16.110us 0.12% 16.110us 5.370us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.15% 20.500us 0.15% 20.500us 2.278us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.03% 3.660us 0.03% 3.660us 1.220us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.20% 26.400us 0.20% 26.400us 8.800us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 87.41% 11.823ms 87.41% 11.823ms 11.823ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.58% 116.241us 37.17% 1.677ms 1.677ms 0.000us 0.00% 4.040ms 4.040ms 1 + _flash_attn_9e27194::fwd 1.11% 49.909us 34.60% 1.561ms 520.326us 3.012ms 100.00% 4.040ms 1.347ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.013ms 100.04% 3.013ms 3.013ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.012ms 100.00% 3.012ms 1.004ms 3 + Activity Buffer Request 31.60% 1.426ms 31.60% 1.426ms 1.426ms 1.029ms 34.16% 1.029ms 1.029ms 1 + cudaDeviceGetAttribute 0.08% 3.801us 0.08% 3.801us 0.253us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.18% 8.151us 0.55% 24.960us 8.320us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.37% 16.809us 0.37% 16.809us 5.603us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.47% 21.201us 0.47% 21.201us 2.356us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.09% 3.950us 0.09% 3.950us 1.317us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.69% 31.260us 0.69% 31.260us 10.420us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 62.83% 2.835ms 62.83% 2.835ms 2.835ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 13.526ms -Self CUDA time total: 11.964ms +Self CPU time total: 4.512ms +Self CUDA time total: 3.012ms @@ -3998,21 +3998,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 0.67% 93.544us 14.10% 1.960ms 1.960ms 0.000us 0.00% 16.171ms 16.171ms 1 - _flash_attn_9e27194::fwd 0.34% 47.108us 13.43% 1.866ms 622.149us 12.086ms 100.00% 16.171ms 5.390ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 12.088ms 100.02% 12.088ms 12.088ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 12.086ms 100.00% 12.086ms 4.029ms 3 - Activity Buffer Request 10.87% 1.511ms 10.87% 1.511ms 1.511ms 4.085ms 33.80% 4.085ms 4.085ms 1 - cudaDeviceGetAttribute 0.03% 4.151us 0.03% 4.151us 0.277us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.05% 7.020us 0.18% 24.401us 8.134us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.13% 17.381us 0.13% 17.381us 5.794us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.16% 21.650us 0.16% 21.650us 2.406us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.03% 3.680us 0.03% 3.680us 1.227us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.83% 254.116us 1.83% 254.116us 84.705us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 85.90% 11.939ms 85.90% 11.939ms 11.939ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.01% 99.212us 38.53% 1.898ms 1.898ms 0.000us 0.00% 4.264ms 4.264ms 1 + _flash_attn_9e27194::fwd 1.06% 52.152us 36.51% 1.799ms 599.723us 3.190ms 100.00% 4.264ms 1.421ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.191ms 100.05% 3.191ms 3.191ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.190ms 100.00% 3.190ms 1.063ms 3 + Activity Buffer Request 28.82% 1.420ms 28.82% 1.420ms 1.420ms 1.074ms 33.68% 1.074ms 1.074ms 1 + cudaDeviceGetAttribute 0.09% 4.479us 0.09% 4.479us 0.299us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.16% 7.900us 0.54% 26.470us 8.823us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.38% 18.570us 0.38% 18.570us 6.190us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.46% 22.430us 0.46% 22.430us 2.492us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.08% 3.830us 0.08% 3.830us 1.277us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 5.47% 269.763us 5.47% 269.763us 89.921us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 61.47% 3.029ms 61.47% 3.029ms 3.029ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 13.899ms -Self CUDA time total: 12.086ms +Self CPU time total: 4.928ms +Self CUDA time total: 3.190ms @@ -4022,21 +4022,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 0.66% 93.812us 13.64% 1.945ms 1.945ms 0.000us 0.00% 16.623ms 16.623ms 1 - _flash_attn_9e27194::fwd 0.35% 50.392us 12.98% 1.852ms 617.193us 12.470ms 100.00% 16.623ms 5.541ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 12.472ms 100.02% 12.472ms 12.472ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 12.470ms 100.00% 12.470ms 4.157ms 3 - Activity Buffer Request 10.49% 1.496ms 10.49% 1.496ms 1.496ms 4.153ms 33.30% 4.153ms 4.153ms 1 - cudaDeviceGetAttribute 0.03% 4.180us 0.03% 4.180us 0.279us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.11% 15.512us 0.23% 32.181us 10.727us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.12% 16.669us 0.12% 16.669us 5.556us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.15% 21.480us 0.15% 21.480us 2.387us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.03% 4.150us 0.03% 4.150us 1.383us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.70% 242.835us 1.70% 242.835us 80.945us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 86.36% 12.315ms 86.36% 12.315ms 12.315ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.16% 88.971us 14.91% 614.057us 614.057us 0.000us 0.00% 4.875ms 4.875ms 1 + _flash_attn_9e27194::fwd 1.23% 50.539us 12.75% 525.086us 175.029us 3.652ms 100.00% 4.875ms 1.625ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.653ms 100.04% 3.653ms 3.653ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.652ms 100.00% 3.652ms 1.217ms 3 + Activity Buffer Request 5.08% 209.112us 5.08% 209.112us 209.112us 1.223ms 33.50% 1.223ms 1.223ms 1 + cudaDeviceGetAttribute 0.10% 3.960us 0.10% 3.960us 0.264us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.19% 7.749us 0.60% 24.700us 8.233us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.41% 16.951us 0.41% 16.951us 5.650us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.54% 22.121us 0.54% 22.121us 2.458us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.10% 4.190us 0.10% 4.190us 1.397us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 5.11% 210.464us 5.11% 210.464us 70.155us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 85.09% 3.504ms 85.09% 3.504ms 3.504ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 14.261ms -Self CUDA time total: 12.470ms +Self CPU time total: 4.118ms +Self CUDA time total: 3.652ms @@ -4046,89 +4046,88 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 0.61% 96.222us 15.74% 2.480ms 2.480ms 0.000us 0.00% 17.900ms 17.900ms 1 - _flash_attn_9e27194::fwd 0.31% 49.571us 15.13% 2.384ms 794.661us 13.426ms 100.00% 17.900ms 5.967ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 13.428ms 100.02% 13.428ms 13.428ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 13.426ms 100.00% 13.426ms 4.475ms 3 - Activity Buffer Request 9.64% 1.519ms 9.64% 1.519ms 1.519ms 4.474ms 33.33% 4.474ms 4.474ms 1 - cudaDeviceGetAttribute 0.03% 4.041us 0.03% 4.041us 0.269us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.05% 7.901us 0.16% 24.582us 8.194us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.11% 16.681us 0.11% 16.681us 5.560us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.13% 20.818us 0.13% 20.818us 2.313us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.02% 3.610us 0.02% 3.610us 1.203us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 4.84% 761.957us 4.84% 761.957us 253.986us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 84.26% 13.278ms 84.26% 13.278ms 13.278ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.23% 91.402us 14.65% 600.857us 600.857us 0.000us 0.00% 4.881ms 4.881ms 1 + _flash_attn_9e27194::fwd 1.15% 47.191us 12.42% 509.455us 169.818us 3.654ms 100.00% 4.881ms 1.627ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 100.04% 3.655ms 3.655ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.654ms 100.00% 3.654ms 1.218ms 3 + Activity Buffer Request 5.38% 220.623us 5.38% 220.623us 220.623us 1.227ms 33.59% 1.227ms 1.227ms 1 + cudaDeviceGetAttribute 0.09% 3.601us 0.09% 3.601us 0.240us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.18% 7.230us 0.58% 23.840us 7.947us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.40% 16.610us 0.40% 16.610us 5.537us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.51% 20.851us 0.51% 20.851us 2.317us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.09% 3.688us 0.09% 3.688us 1.229us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 4.62% 189.661us 4.62% 189.661us 63.220us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 85.35% 3.502ms 85.35% 3.502ms 3.502ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 15.758ms -Self CUDA time total: 13.426ms +Self CPU time total: 4.103ms +Self CUDA time total: 3.654ms impl wl p50(ms) ok -hf_kernels_flash_attn cuda_attn_L128_bfloat16 2.82 True -hf_kernels_flash_attn cuda_attn_L256_bfloat16 3.91 True -hf_kernels_flash_attn cuda_attn_L320_bfloat16 4.12 True -hf_kernels_flash_attn cuda_attn_L384_bfloat16 4.13 True -hf_kernels_flash_attn cuda_attn_L448_bfloat16 4.11 True -hf_kernels_flash_attn cuda_attn_L512_bfloat16 4.57 True +hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.98 True +hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.02 True +hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True +hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.07 True +hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.23 True +hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
▶ UV Install Logs
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s] -Fetching 20 files: 5%|▌ | 1/20 [00:00<00:04, 4.15it/s] -Fetching 20 files: 10%|█ | 2/20 [00:03<00:35, 1.96s/it] -Fetching 20 files: 100%|██████████| 20/20 [00:03<00:00, 5.86it/s]
+Fetching 20 files: 10%|█ | 2/20 [00:01<00:12, 1.43it/s] +Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 14.34it/s]

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/hf_kernels_flash_attn3.html b/flash_attn/impls/hf_kernels_flash_attn3.html index 9931f2961261033f4de2f06ea452f344486787ca..a053bb95457c873b96f776bcf4309302293dd2b6 100644 --- a/flash_attn/impls/hf_kernels_flash_attn3.html +++ b/flash_attn/impls/hf_kernels_flash_attn3.html @@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 45.91s +Cell: benchmark | 5.62s | Raw @@ -3925,19 +3925,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 1.92% 183.884us 20.78% 1.986ms 1.986ms 0.000us 0.00% 10.512ms 10.512ms 1 - FlashAttnFunc 1.41% 134.465us 18.86% 1.802ms 600.660us 0.000us 0.00% 10.512ms 3.504ms 3 - _flash_attn3_48fe103_dirty::fwd 0.80% 76.599us 17.45% 1.668ms 555.838us 7.883ms 100.00% 10.512ms 3.504ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 7.884ms 100.02% 7.884ms 7.884ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 7.883ms 100.00% 7.883ms 2.628ms 3 - Activity Buffer Request 15.56% 1.487ms 15.56% 1.487ms 1.487ms 2.629ms 33.36% 2.629ms 2.629ms 1 - aten::empty 0.46% 44.151us 0.46% 44.151us 7.358us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.16% 15.420us 0.16% 15.420us 5.140us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.46% 44.162us 0.46% 44.162us 14.721us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 79.22% 7.570ms 79.22% 7.570ms 7.570ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 3.90% 171.143us 44.22% 1.941ms 1.941ms 0.000us 0.00% 3.653ms 3.653ms 1 + FlashAttnFunc 2.92% 128.011us 40.32% 1.769ms 589.788us 0.000us 0.00% 3.653ms 1.218ms 3 + _flash_attn3_48fe103_dirty::fwd 1.90% 83.422us 37.41% 1.641ms 547.118us 2.755ms 100.00% 3.653ms 1.218ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.756ms 100.05% 2.756ms 2.756ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.755ms 100.00% 2.755ms 918.306us 3 + Activity Buffer Request 33.13% 1.454ms 33.13% 1.454ms 1.454ms 898.082us 32.60% 898.082us 898.082us 1 + aten::empty 1.02% 44.762us 1.02% 44.762us 7.460us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.33% 14.660us 0.33% 14.660us 4.887us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.02% 44.660us 1.02% 44.660us 14.887us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 55.78% 2.447ms 55.78% 2.447ms 2.447ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 9.555ms -Self CUDA time total: 7.883ms +Self CPU time total: 4.388ms +Self CUDA time total: 2.755ms @@ -3947,19 +3947,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - FlashAttnFunc 0.80% 101.601us 13.56% 1.712ms 570.799us 0.000us 0.00% 14.746ms 4.915ms 3 - _flash_attn3_48fe103_dirty::fwd 0.39% 49.531us 12.75% 1.611ms 536.932us 11.037ms 100.00% 14.746ms 4.915ms 3 - hf_kernels_flash_attn3 0.89% 111.943us 14.45% 1.824ms 1.824ms 0.000us 0.00% 14.746ms 14.746ms 1 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 11.039ms 100.02% 11.039ms 11.039ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 11.037ms 100.00% 11.037ms 3.679ms 3 - Activity Buffer Request 11.87% 1.500ms 11.87% 1.500ms 1.500ms 3.709ms 33.60% 3.709ms 3.709ms 1 - aten::empty 0.21% 26.220us 0.21% 26.220us 4.370us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.04% 5.092us 0.04% 5.092us 1.697us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.24% 30.290us 0.24% 30.290us 10.097us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 85.55% 10.805ms 85.55% 10.805ms 10.805ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.42% 105.470us 40.03% 1.743ms 1.743ms 0.000us 0.00% 3.784ms 3.784ms 1 + FlashAttnFunc 2.12% 92.121us 37.61% 1.638ms 546.005us 0.000us 0.00% 3.784ms 1.261ms 3 + _flash_attn3_48fe103_dirty::fwd 1.23% 53.460us 35.49% 1.546ms 515.298us 2.836ms 100.00% 3.784ms 1.261ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.838ms 100.05% 2.838ms 2.838ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.836ms 100.00% 2.836ms 945.359us 3 + Activity Buffer Request 32.85% 1.431ms 32.85% 1.431ms 1.431ms 947.652us 33.41% 947.652us 947.652us 1 + aten::empty 0.62% 27.052us 0.62% 27.052us 4.509us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 4.721us 0.11% 4.721us 1.574us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.68% 29.730us 0.68% 29.730us 9.910us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 59.97% 2.612ms 59.97% 2.612ms 2.612ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 12.629ms -Self CUDA time total: 11.037ms +Self CPU time total: 4.355ms +Self CUDA time total: 2.836ms @@ -3969,19 +3969,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 0.84% 108.082us 14.36% 1.851ms 1.851ms 0.000us 0.00% 15.081ms 15.081ms 1 - FlashAttnFunc 0.79% 101.882us 13.52% 1.743ms 580.849us 0.000us 0.00% 15.081ms 5.027ms 3 - _flash_attn3_48fe103_dirty::fwd 0.38% 48.472us 12.73% 1.641ms 546.889us 11.268ms 100.00% 15.081ms 5.027ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 11.269ms 100.02% 11.269ms 11.269ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 11.268ms 100.00% 11.268ms 3.756ms 3 - Activity Buffer Request 11.87% 1.530ms 11.87% 1.530ms 1.530ms 3.813ms 33.84% 3.813ms 3.813ms 1 - aten::empty 0.21% 26.670us 0.21% 26.670us 4.445us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.04% 5.170us 0.04% 5.170us 1.723us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.24% 30.581us 0.24% 30.581us 10.194us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 85.64% 11.041ms 85.64% 11.041ms 11.041ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.34% 104.112us 39.68% 1.767ms 1.767ms 0.000us 0.00% 3.931ms 3.931ms 1 + FlashAttnFunc 2.59% 115.143us 37.35% 1.662ms 554.155us 0.000us 0.00% 3.931ms 1.310ms 3 + _flash_attn3_48fe103_dirty::fwd 1.23% 54.772us 34.76% 1.547ms 515.774us 2.932ms 100.00% 3.931ms 1.310ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.934ms 100.05% 2.934ms 2.934ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.932ms 100.00% 2.932ms 977.432us 3 + Activity Buffer Request 32.05% 1.427ms 32.05% 1.427ms 1.427ms 998.487us 34.05% 998.487us 998.487us 1 + aten::empty 0.66% 29.309us 0.66% 29.309us 4.885us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 4.840us 0.11% 4.840us 1.613us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.71% 31.520us 0.71% 31.520us 10.507us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 60.32% 2.685ms 60.32% 2.685ms 2.685ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 12.891ms -Self CUDA time total: 11.268ms +Self CPU time total: 4.452ms +Self CUDA time total: 2.932ms @@ -3991,19 +3991,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 0.87% 107.542us 12.07% 1.493ms 1.493ms 0.000us 0.00% 14.923ms 14.923ms 1 - FlashAttnFunc 0.84% 104.222us 11.20% 1.385ms 461.687us 0.000us 0.00% 14.923ms 4.974ms 3 - _flash_attn3_48fe103_dirty::fwd 0.41% 51.032us 10.36% 1.281ms 426.946us 11.101ms 100.00% 14.923ms 4.974ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 11.102ms 100.02% 11.102ms 11.102ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 11.101ms 100.00% 11.101ms 3.700ms 3 - Activity Buffer Request 7.69% 950.601us 7.69% 950.601us 950.601us 3.822ms 34.43% 3.822ms 3.822ms 1 - aten::empty 0.22% 27.719us 0.22% 27.719us 4.620us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.04% 5.160us 0.04% 5.160us 1.720us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.99% 246.326us 1.99% 246.326us 82.109us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 87.93% 10.869ms 87.93% 10.869ms 10.869ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.48% 118.391us 41.58% 1.983ms 1.983ms 0.000us 0.00% 4.029ms 4.029ms 1 + FlashAttnFunc 2.00% 95.232us 39.09% 1.865ms 621.579us 0.000us 0.00% 4.029ms 1.343ms 3 + _flash_attn3_48fe103_dirty::fwd 1.18% 56.301us 37.10% 1.770ms 589.835us 3.014ms 100.00% 4.029ms 1.343ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.016ms 100.06% 3.016ms 3.016ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.014ms 100.00% 3.014ms 1.005ms 3 + Activity Buffer Request 30.19% 1.440ms 30.19% 1.440ms 1.440ms 1.015ms 33.67% 1.015ms 1.015ms 1 + aten::empty 0.58% 27.710us 0.58% 27.710us 4.618us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.10% 4.771us 0.10% 4.771us 1.590us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 5.05% 240.873us 5.05% 240.873us 80.291us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 58.42% 2.787ms 58.42% 2.787ms 2.787ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 12.361ms -Self CUDA time total: 11.101ms +Self CPU time total: 4.770ms +Self CUDA time total: 3.014ms @@ -4013,19 +4013,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 0.89% 122.681us 14.72% 2.032ms 2.032ms 0.000us 0.00% 16.019ms 16.019ms 1 - FlashAttnFunc 0.72% 100.054us 13.83% 1.909ms 636.464us 0.000us 0.00% 16.019ms 5.340ms 3 - _flash_attn3_48fe103_dirty::fwd 0.37% 50.743us 13.11% 1.809ms 603.113us 11.999ms 100.00% 16.019ms 5.340ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 12.001ms 100.02% 12.001ms 12.001ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 11.999ms 100.00% 11.999ms 4.000ms 3 - Activity Buffer Request 10.68% 1.474ms 10.68% 1.474ms 1.474ms 4.020ms 33.50% 4.020ms 4.020ms 1 - aten::empty 0.20% 27.509us 0.20% 27.509us 4.585us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.04% 5.180us 0.04% 5.180us 1.727us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.82% 251.475us 1.82% 251.475us 83.825us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 85.28% 11.773ms 85.28% 11.773ms 11.773ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.45% 127.821us 37.14% 1.937ms 1.937ms 0.000us 0.00% 4.669ms 4.669ms 1 + FlashAttnFunc 1.78% 92.961us 34.69% 1.809ms 603.079us 0.000us 0.00% 4.669ms 1.556ms 3 + _flash_attn3_48fe103_dirty::fwd 0.98% 50.990us 32.91% 1.716ms 572.092us 3.496ms 100.00% 4.669ms 1.556ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.498ms 100.05% 3.498ms 3.498ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.496ms 100.00% 3.496ms 1.165ms 3 + Activity Buffer Request 27.66% 1.443ms 27.66% 1.443ms 1.443ms 1.173ms 33.56% 1.173ms 1.173ms 1 + aten::empty 0.56% 28.951us 0.56% 28.951us 4.825us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.09% 4.870us 0.09% 4.870us 1.623us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.62% 188.673us 3.62% 188.673us 62.891us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 62.86% 3.279ms 62.86% 3.279ms 3.279ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 13.805ms -Self CUDA time total: 11.999ms +Self CPU time total: 5.216ms +Self CUDA time total: 3.496ms @@ -4035,87 +4035,34 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 0.65% 102.032us 20.79% 3.268ms 3.268ms 0.000us 0.00% 16.971ms 16.971ms 1 - FlashAttnFunc 0.66% 104.392us 20.14% 3.166ms 1.055ms 0.000us 0.00% 16.971ms 5.657ms 3 - _flash_attn3_48fe103_dirty::fwd 0.30% 47.113us 19.48% 3.062ms 1.021ms 12.681ms 100.00% 16.971ms 5.657ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 12.683ms 100.02% 12.683ms 12.683ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 12.681ms 100.00% 12.681ms 4.227ms 3 - Activity Buffer Request 10.87% 1.709ms 10.87% 1.709ms 1.709ms 4.290ms 33.83% 4.290ms 4.290ms 1 - aten::empty 0.17% 27.090us 0.17% 27.090us 4.515us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.03% 5.219us 0.03% 5.219us 1.740us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 8.10% 1.273ms 8.10% 1.273ms 424.362us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 79.21% 12.453ms 79.21% 12.453ms 12.453ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.26% 115.651us 36.11% 1.844ms 1.844ms 0.000us 0.00% 4.648ms 4.648ms 1 + FlashAttnFunc 1.78% 91.130us 33.84% 1.728ms 576.085us 0.000us 0.00% 4.648ms 1.549ms 3 + _flash_attn3_48fe103_dirty::fwd 1.06% 54.250us 32.06% 1.637ms 545.708us 3.480ms 100.00% 4.648ms 1.549ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.481ms 100.04% 3.481ms 3.481ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.480ms 100.00% 3.480ms 1.160ms 3 + Activity Buffer Request 27.00% 1.379ms 27.00% 1.379ms 1.379ms 1.168ms 33.58% 1.168ms 1.168ms 1 + aten::empty 0.55% 28.142us 0.55% 28.142us 4.690us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.10% 5.261us 0.10% 5.261us 1.754us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.35% 170.883us 3.35% 170.883us 56.961us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 63.89% 3.263ms 63.89% 3.263ms 3.263ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 15.722ms -Self CUDA time total: 12.681ms +Self CPU time total: 5.107ms +Self CUDA time total: 3.480ms impl wl p50(ms) ok -hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 3.22 True -hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 3.77 True -hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 3.91 True -hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 3.97 True -hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 4.19 True -hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 4.41 True +hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.95 True +hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True +hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True +hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.04 True +hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.21 True +hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
-
-
▶ UV Install Logs
- -
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s] -Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 7.95it/s] -Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.15it/s] -Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.64it/s]

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/mem_efficient_attention.html b/flash_attn/impls/mem_efficient_attention.html index fe7541e7ea06ed98fb77a03fb38c560aafd75082..2ef177e15d8ffcd4554ffa06fae5689015fee95f 100644 --- a/flash_attn/impls/mem_efficient_attention.html +++ b/flash_attn/impls/mem_efficient_attention.html @@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 44.03s +Cell: benchmark | 4.02s | Raw @@ -3924,28 +3924,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 2.05% 363.238us 13.65% 2.421ms 2.421ms 0.000us 0.00% 16.223ms 16.223ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 16.048ms 100.05% 16.048ms 16.048ms 1 - aten::scaled_dot_product_attention 0.20% 35.830us 1.03% 182.144us 60.715us 0.000us 0.00% 14.265ms 4.755ms 3 - aten::_scaled_dot_product_efficient_attention 0.13% 22.700us 0.82% 146.314us 48.771us 0.000us 0.00% 14.265ms 4.755ms 3 - aten::_efficient_attention_forward 0.19% 33.351us 0.54% 96.203us 32.068us 14.265ms 88.94% 14.265ms 4.755ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 14.265ms 88.94% 14.265ms 4.755ms 3 - aten::contiguous 0.08% 13.451us 10.18% 1.806ms 200.629us 0.000us 0.00% 1.957ms 217.467us 9 - aten::clone 0.17% 30.701us 10.10% 1.792ms 199.134us 0.000us 0.00% 1.957ms 217.467us 9 - aten::copy_ 0.43% 76.213us 9.49% 1.684ms 187.121us 1.775ms 11.06% 1.957ms 217.467us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.775ms 11.06% 1.775ms 197.189us 9 - Activity Buffer Request 8.62% 1.529ms 8.62% 1.529ms 1.529ms 182.494us 1.14% 182.494us 182.494us 1 - aten::transpose 0.41% 73.552us 0.55% 97.771us 4.074us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.14% 24.219us 0.14% 24.219us 1.009us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.13% 23.478us 0.44% 77.421us 8.602us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.48% 85.684us 0.48% 85.684us 4.080us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 0.58% 102.581us 0.58% 102.581us 8.548us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.02% 3.010us 0.02% 3.010us 1.003us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.02% 4.301us 0.02% 4.301us 1.434us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 86.35% 15.322ms 86.35% 15.322ms 15.322ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 4.61% 329.029us 32.49% 2.320ms 2.320ms 0.000us 0.00% 5.545ms 5.545ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.524ms 100.54% 5.524ms 5.524ms 1 + aten::scaled_dot_product_attention 0.42% 29.860us 2.75% 196.242us 65.414us 0.000us 0.00% 4.878ms 1.626ms 3 + aten::_scaled_dot_product_efficient_attention 0.35% 25.230us 2.33% 166.382us 55.461us 0.000us 0.00% 4.878ms 1.626ms 3 + aten::_efficient_attention_forward 0.73% 52.049us 1.68% 119.861us 39.954us 4.878ms 88.79% 4.878ms 1.626ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.878ms 88.79% 4.878ms 1.626ms 3 + aten::contiguous 0.18% 13.143us 24.28% 1.734ms 192.643us 0.000us 0.00% 666.300us 74.033us 9 + aten::clone 0.50% 35.608us 24.09% 1.721ms 191.183us 0.000us 0.00% 666.300us 74.033us 9 + aten::copy_ 1.01% 71.952us 22.59% 1.613ms 179.214us 615.708us 11.21% 666.300us 74.033us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 615.708us 11.21% 615.708us 68.412us 9 + Activity Buffer Request 20.33% 1.452ms 20.33% 1.452ms 1.452ms 50.592us 0.92% 50.592us 50.592us 1 + aten::transpose 0.87% 61.994us 1.16% 82.494us 3.437us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.29% 20.500us 0.29% 20.500us 0.854us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.25% 17.742us 1.01% 72.112us 8.012us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 1.17% 83.610us 1.17% 83.610us 3.981us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.60% 114.582us 1.60% 114.582us 9.548us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.04% 3.180us 0.04% 3.180us 1.060us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.14% 10.280us 0.14% 10.280us 3.427us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 67.51% 4.821ms 67.51% 4.821ms 4.821ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 17.744ms -Self CUDA time total: 16.040ms +Self CPU time total: 7.141ms +Self CUDA time total: 5.494ms @@ -3955,28 +3955,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 1.10% 253.536us 9.32% 2.141ms 2.141ms 0.000us 0.00% 21.587ms 21.587ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 21.402ms 100.04% 21.402ms 21.402ms 1 - aten::scaled_dot_product_attention 0.08% 19.430us 0.63% 143.683us 47.894us 0.000us 0.00% 19.557ms 6.519ms 3 - aten::_scaled_dot_product_efficient_attention 0.08% 18.332us 0.54% 124.253us 41.418us 0.000us 0.00% 19.557ms 6.519ms 3 - aten::_efficient_attention_forward 0.12% 28.280us 0.35% 81.271us 27.090us 19.557ms 91.42% 19.557ms 6.519ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 19.557ms 91.42% 19.557ms 6.519ms 3 - aten::contiguous 0.03% 7.109us 7.41% 1.701ms 189.023us 0.000us 0.00% 2.030ms 225.605us 9 - aten::clone 0.09% 20.673us 7.38% 1.694ms 188.233us 0.000us 0.00% 2.030ms 225.605us 9 - aten::copy_ 0.27% 61.032us 7.08% 1.625ms 180.543us 1.836ms 8.58% 2.030ms 225.605us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.836ms 8.58% 1.836ms 203.973us 9 - Activity Buffer Request 6.54% 1.501ms 6.54% 1.501ms 1.501ms 194.686us 0.91% 194.686us 194.686us 1 - aten::transpose 0.22% 49.892us 0.29% 67.250us 2.802us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.08% 17.358us 0.08% 17.358us 0.723us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.05% 11.620us 0.21% 48.540us 5.393us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.27% 63.131us 0.27% 63.131us 3.006us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 0.37% 84.411us 0.37% 84.411us 7.034us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.01% 2.460us 0.01% 2.460us 0.820us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.01% 2.960us 0.01% 2.960us 0.987us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 90.68% 20.821ms 90.68% 20.821ms 20.821ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.39% 253.102us 28.13% 2.097ms 2.097ms 0.000us 0.00% 5.972ms 5.972ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.926ms 100.15% 5.926ms 5.926ms 1 + aten::scaled_dot_product_attention 0.26% 19.190us 1.92% 143.113us 47.704us 0.000us 0.00% 5.278ms 1.759ms 3 + aten::_scaled_dot_product_efficient_attention 0.26% 19.540us 1.66% 123.923us 41.308us 0.000us 0.00% 5.278ms 1.759ms 3 + aten::_efficient_attention_forward 0.37% 27.385us 1.10% 81.652us 27.217us 5.278ms 89.20% 5.278ms 1.759ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.278ms 89.20% 5.278ms 1.759ms 3 + aten::contiguous 0.09% 6.999us 22.26% 1.660ms 184.423us 0.000us 0.00% 693.503us 77.056us 9 + aten::clone 0.31% 23.031us 22.17% 1.653ms 183.645us 0.000us 0.00% 693.503us 77.056us 9 + aten::copy_ 0.83% 61.989us 21.18% 1.579ms 175.477us 638.911us 10.80% 693.503us 77.056us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 638.911us 10.80% 638.911us 70.990us 9 + Activity Buffer Request 19.45% 1.450ms 19.45% 1.450ms 1.450ms 54.592us 0.92% 54.592us 54.592us 1 + aten::transpose 0.64% 47.641us 0.86% 64.101us 2.671us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.22% 16.460us 0.22% 16.460us 0.686us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.16% 11.730us 0.68% 50.483us 5.609us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.86% 64.470us 0.86% 64.470us 3.070us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.21% 90.240us 1.21% 90.240us 7.520us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.290us 0.03% 2.290us 0.763us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.130us 0.04% 3.130us 1.043us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 71.87% 5.359ms 71.87% 5.359ms 5.359ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 22.962ms -Self CUDA time total: 21.392ms +Self CPU time total: 7.456ms +Self CUDA time total: 5.917ms @@ -3986,28 +3986,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 1.02% 243.020us 8.92% 2.127ms 2.127ms 0.000us 0.00% 22.482ms 22.482ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 22.293ms 100.04% 22.293ms 22.293ms 1 - aten::scaled_dot_product_attention 0.08% 18.442us 0.60% 142.065us 47.355us 0.000us 0.00% 20.413ms 6.804ms 3 - aten::_scaled_dot_product_efficient_attention 0.08% 17.984us 0.52% 123.623us 41.208us 0.000us 0.00% 20.413ms 6.804ms 3 - aten::_efficient_attention_forward 0.12% 28.538us 0.35% 82.550us 27.517us 20.413ms 91.61% 20.413ms 6.804ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 20.413ms 91.61% 20.413ms 6.804ms 3 - aten::contiguous 0.03% 7.301us 7.12% 1.699ms 188.733us 0.000us 0.00% 2.068ms 229.822us 9 - aten::clone 0.09% 20.431us 7.09% 1.691ms 187.922us 0.000us 0.00% 2.068ms 229.822us 9 - aten::copy_ 0.25% 59.709us 6.80% 1.622ms 180.233us 1.870ms 8.39% 2.068ms 229.822us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.870ms 8.39% 1.870ms 207.771us 9 - Activity Buffer Request 6.28% 1.498ms 6.28% 1.498ms 1.498ms 198.462us 0.89% 198.462us 198.462us 1 - aten::transpose 0.21% 49.091us 0.28% 66.291us 2.762us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.07% 17.200us 0.07% 17.200us 0.717us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.05% 11.563us 0.20% 48.772us 5.419us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.27% 63.659us 0.27% 63.659us 3.031us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 0.36% 86.324us 0.36% 86.324us 7.194us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.01% 2.431us 0.01% 2.431us 0.810us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.01% 2.970us 0.01% 2.970us 0.990us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 91.08% 21.725ms 91.08% 21.725ms 21.725ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.16% 240.823us 26.89% 2.051ms 2.051ms 0.000us 0.00% 6.167ms 6.167ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.117ms 100.14% 6.117ms 6.117ms 1 + aten::scaled_dot_product_attention 0.24% 18.220us 1.81% 137.732us 45.911us 0.000us 0.00% 5.453ms 1.818ms 3 + aten::_scaled_dot_product_efficient_attention 0.24% 18.402us 1.57% 119.512us 39.837us 0.000us 0.00% 5.453ms 1.818ms 3 + aten::_efficient_attention_forward 0.35% 26.389us 1.04% 79.670us 26.557us 5.453ms 89.28% 5.453ms 1.818ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.453ms 89.28% 5.453ms 1.818ms 3 + aten::contiguous 0.09% 6.950us 21.38% 1.630ms 181.132us 0.000us 0.00% 713.534us 79.282us 9 + aten::clone 0.28% 21.189us 21.28% 1.623ms 180.360us 0.000us 0.00% 713.534us 79.282us 9 + aten::copy_ 0.81% 62.032us 20.34% 1.551ms 172.330us 655.038us 10.72% 713.534us 79.282us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 655.038us 10.72% 655.038us 72.782us 9 + Activity Buffer Request 18.63% 1.421ms 18.63% 1.421ms 1.421ms 58.496us 0.96% 58.496us 58.496us 1 + aten::transpose 0.62% 47.348us 0.84% 63.699us 2.654us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.21% 16.351us 0.21% 16.351us 0.681us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.15% 11.091us 0.67% 51.081us 5.676us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.86% 65.760us 0.86% 65.760us 3.131us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.18% 89.982us 1.18% 89.982us 7.498us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.210us 0.03% 2.210us 0.737us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.100us 0.04% 3.100us 1.033us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 73.11% 5.575ms 73.11% 5.575ms 5.575ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 23.852ms -Self CUDA time total: 22.283ms +Self CPU time total: 7.626ms +Self CUDA time total: 6.108ms @@ -4017,28 +4017,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 1.02% 244.258us 9.92% 2.384ms 2.384ms 0.000us 0.00% 22.468ms 22.468ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 22.273ms 100.04% 22.273ms 22.273ms 1 - aten::scaled_dot_product_attention 0.08% 18.581us 0.64% 152.823us 50.941us 0.000us 0.00% 20.365ms 6.788ms 3 - aten::_scaled_dot_product_efficient_attention 0.08% 18.340us 0.56% 134.242us 44.747us 0.000us 0.00% 20.365ms 6.788ms 3 - aten::_efficient_attention_forward 0.12% 27.659us 0.39% 92.632us 30.877us 20.365ms 91.47% 20.365ms 6.788ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 20.365ms 91.47% 20.365ms 6.788ms 3 - aten::contiguous 0.03% 7.371us 8.08% 1.943ms 215.938us 0.000us 0.00% 2.103ms 233.655us 9 - aten::clone 0.09% 21.799us 8.05% 1.936ms 215.119us 0.000us 0.00% 2.103ms 233.655us 9 - aten::copy_ 0.27% 65.442us 7.66% 1.841ms 204.604us 1.898ms 8.53% 2.103ms 233.655us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.898ms 8.53% 1.898ms 210.921us 9 - Activity Buffer Request 6.22% 1.495ms 6.22% 1.495ms 1.495ms 204.607us 0.92% 204.607us 204.607us 1 - aten::transpose 0.20% 48.657us 0.28% 66.799us 2.783us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.08% 18.142us 0.08% 18.142us 0.756us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.13% 32.371us 0.30% 72.832us 8.092us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.29% 69.063us 0.29% 69.063us 3.289us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 1.30% 311.775us 1.30% 311.775us 25.981us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.01% 2.430us 0.01% 2.430us 0.810us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.01% 2.951us 0.01% 2.951us 0.984us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 90.08% 21.659ms 90.08% 21.659ms 21.659ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 4.44% 356.182us 33.00% 2.648ms 2.648ms 0.000us 0.00% 6.210ms 6.210ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.165ms 100.21% 6.165ms 6.165ms 1 + aten::scaled_dot_product_attention 0.29% 23.400us 2.31% 185.263us 61.754us 0.000us 0.00% 5.497ms 1.832ms 3 + aten::_scaled_dot_product_efficient_attention 0.29% 23.202us 2.02% 161.863us 53.954us 0.000us 0.00% 5.497ms 1.832ms 3 + aten::_efficient_attention_forward 0.44% 35.239us 1.36% 108.811us 36.270us 5.497ms 89.36% 5.497ms 1.832ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.497ms 89.36% 5.497ms 1.832ms 3 + aten::contiguous 0.11% 9.040us 25.54% 2.050ms 227.726us 0.000us 0.00% 712.735us 79.193us 9 + aten::clone 0.35% 28.461us 25.43% 2.040ms 226.722us 0.000us 0.00% 712.735us 79.193us 9 + aten::copy_ 1.02% 82.020us 24.22% 1.944ms 215.993us 654.527us 10.64% 712.735us 79.193us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.527us 10.64% 654.527us 72.725us 9 + Activity Buffer Request 19.35% 1.553ms 19.35% 1.553ms 1.553ms 58.208us 0.95% 58.208us 58.208us 1 + aten::transpose 0.81% 64.960us 1.09% 87.330us 3.639us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.28% 22.370us 0.28% 22.370us 0.932us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.19% 15.081us 0.85% 68.092us 7.566us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 1.09% 87.522us 1.09% 87.522us 4.168us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 4.25% 341.154us 4.25% 341.154us 28.429us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.04% 2.841us 0.04% 2.841us 0.947us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.05% 4.120us 0.05% 4.120us 1.373us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 67.00% 5.376ms 67.00% 5.376ms 5.376ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 24.043ms -Self CUDA time total: 22.264ms +Self CPU time total: 8.025ms +Self CUDA time total: 6.152ms @@ -4048,28 +4048,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 0.99% 238.965us 8.38% 2.024ms 2.024ms 0.000us 0.00% 22.887ms 22.887ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 22.691ms 100.04% 22.691ms 22.691ms 1 - aten::scaled_dot_product_attention 0.08% 19.540us 0.60% 145.283us 48.428us 0.000us 0.00% 20.756ms 6.919ms 3 - aten::_scaled_dot_product_efficient_attention 0.08% 18.450us 0.52% 125.743us 41.914us 0.000us 0.00% 20.756ms 6.919ms 3 - aten::_efficient_attention_forward 0.12% 28.200us 0.34% 82.042us 27.347us 20.756ms 91.51% 20.756ms 6.919ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 20.756ms 91.51% 20.756ms 6.919ms 3 - aten::contiguous 0.03% 7.310us 6.62% 1.597ms 177.483us 0.000us 0.00% 2.130ms 236.720us 9 - aten::clone 0.08% 20.502us 6.59% 1.590ms 176.671us 0.000us 0.00% 2.130ms 236.720us 9 - aten::copy_ 0.25% 60.710us 6.29% 1.519ms 168.815us 1.926ms 8.49% 2.130ms 236.720us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.926ms 8.49% 1.926ms 213.965us 9 - Activity Buffer Request 4.97% 1.199ms 4.97% 1.199ms 1.199ms 204.798us 0.90% 204.798us 204.798us 1 - aten::transpose 0.21% 49.950us 0.28% 67.671us 2.820us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.07% 17.721us 0.07% 17.721us 0.738us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.05% 11.321us 0.21% 50.202us 5.578us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.27% 64.383us 0.27% 64.383us 3.066us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 1.17% 282.217us 1.17% 282.217us 23.518us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.01% 2.720us 0.01% 2.720us 0.907us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.01% 3.029us 0.01% 3.029us 1.010us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 91.62% 22.117ms 91.62% 22.117ms 22.117ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.33% 272.217us 28.45% 2.323ms 2.323ms 0.000us 0.00% 6.452ms 6.452ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.401ms 100.14% 6.401ms 6.401ms 1 + aten::scaled_dot_product_attention 0.25% 20.040us 1.74% 141.700us 47.233us 0.000us 0.00% 5.729ms 1.910ms 3 + aten::_scaled_dot_product_efficient_attention 0.23% 18.560us 1.49% 121.660us 40.553us 0.000us 0.00% 5.729ms 1.910ms 3 + aten::_efficient_attention_forward 0.34% 27.420us 1.00% 81.440us 27.147us 5.729ms 89.62% 5.729ms 1.910ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.729ms 89.62% 5.729ms 1.910ms 3 + aten::contiguous 0.09% 7.310us 22.83% 1.865ms 207.177us 0.000us 0.00% 723.614us 80.402us 9 + aten::clone 0.27% 22.438us 22.75% 1.857ms 206.364us 0.000us 0.00% 723.614us 80.402us 9 + aten::copy_ 0.75% 61.292us 21.84% 1.783ms 198.108us 663.806us 10.38% 723.614us 80.402us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 663.806us 10.38% 663.806us 73.756us 9 + Activity Buffer Request 18.13% 1.481ms 18.13% 1.481ms 1.481ms 59.808us 0.94% 59.808us 59.808us 1 + aten::transpose 0.61% 49.591us 0.81% 66.019us 2.751us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.20% 16.428us 0.20% 16.428us 0.684us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.14% 11.501us 0.64% 51.871us 5.763us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.80% 65.620us 0.80% 65.620us 3.125us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 3.24% 264.473us 3.24% 264.473us 22.039us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.310us 0.03% 2.310us 0.770us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.060us 0.04% 3.060us 1.020us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 71.55% 5.843ms 71.55% 5.843ms 5.843ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 24.141ms -Self CUDA time total: 22.682ms +Self CPU time total: 8.166ms +Self CUDA time total: 6.392ms @@ -4079,90 +4079,38 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 0.89% 241.438us 9.64% 2.630ms 2.630ms 0.000us 0.00% 25.454ms 25.454ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 25.223ms 100.04% 25.223ms 25.223ms 1 - aten::scaled_dot_product_attention 0.07% 18.690us 0.53% 143.613us 47.871us 0.000us 0.00% 22.917ms 7.639ms 3 - aten::_scaled_dot_product_efficient_attention 0.07% 19.432us 0.46% 124.923us 41.641us 0.000us 0.00% 22.917ms 7.639ms 3 - aten::_efficient_attention_forward 0.10% 27.951us 0.30% 81.832us 27.277us 22.917ms 90.90% 22.917ms 7.639ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 22.917ms 90.90% 22.917ms 7.639ms 3 - aten::contiguous 0.03% 7.769us 8.07% 2.200ms 244.390us 0.000us 0.00% 2.537ms 281.850us 9 - aten::clone 0.08% 21.360us 8.04% 2.192ms 243.526us 0.000us 0.00% 2.537ms 281.850us 9 - aten::copy_ 0.23% 62.351us 7.77% 2.118ms 235.368us 2.295ms 9.10% 2.537ms 281.850us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.295ms 9.10% 2.295ms 255.042us 9 - Activity Buffer Request 5.96% 1.625ms 5.96% 1.625ms 1.625ms 241.278us 0.96% 241.278us 241.278us 1 - aten::transpose 0.19% 51.326us 0.25% 68.688us 2.862us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.06% 17.362us 0.06% 17.362us 0.723us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.04% 11.861us 0.19% 52.062us 5.785us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.24% 65.461us 0.24% 65.461us 3.117us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 1.67% 454.311us 1.67% 454.311us 37.859us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.01% 2.710us 0.01% 2.710us 0.903us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.01% 2.880us 0.01% 2.880us 0.960us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 90.36% 24.642ms 90.36% 24.642ms 24.642ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 2.84% 238.921us 26.25% 2.206ms 2.206ms 0.000us 0.00% 6.803ms 6.803ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.751ms 100.13% 6.751ms 6.751ms 1 + aten::scaled_dot_product_attention 0.23% 19.080us 1.67% 140.122us 46.707us 0.000us 0.00% 6.072ms 2.024ms 3 + aten::_scaled_dot_product_efficient_attention 0.22% 18.680us 1.44% 121.042us 40.347us 0.000us 0.00% 6.072ms 2.024ms 3 + aten::_efficient_attention_forward 0.32% 27.009us 0.95% 79.840us 26.613us 6.072ms 90.07% 6.072ms 2.024ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 6.072ms 90.07% 6.072ms 2.024ms 3 + aten::contiguous 0.09% 7.439us 21.24% 1.785ms 198.324us 0.000us 0.00% 731.099us 81.233us 9 + aten::clone 0.26% 21.852us 21.15% 1.777ms 197.498us 0.000us 0.00% 731.099us 81.233us 9 + aten::copy_ 0.77% 64.769us 20.27% 1.703ms 189.239us 669.820us 9.93% 731.099us 81.233us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 669.820us 9.93% 669.820us 74.424us 9 + Activity Buffer Request 16.92% 1.422ms 16.92% 1.422ms 1.422ms 61.279us 0.91% 61.279us 61.279us 1 + aten::transpose 0.57% 48.271us 0.77% 64.334us 2.681us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.19% 16.063us 0.19% 16.063us 0.669us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.14% 11.440us 0.62% 52.480us 5.831us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.79% 66.661us 0.79% 66.661us 3.174us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 2.84% 238.383us 2.84% 238.383us 19.865us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.270us 0.03% 2.270us 0.757us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.090us 0.04% 3.090us 1.030us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 73.75% 6.196ms 73.75% 6.196ms 6.196ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 27.271ms -Self CUDA time total: 25.213ms +Self CPU time total: 8.402ms +Self CUDA time total: 6.742ms impl wl p50(ms) ok -torch_mem_eff cuda_attn_L128_bfloat16 6.77 True -torch_mem_eff cuda_attn_L256_bfloat16 7.24 True -torch_mem_eff cuda_attn_L320_bfloat16 7.52 True -torch_mem_eff cuda_attn_L384_bfloat16 7.59 True -torch_mem_eff cuda_attn_L448_bfloat16 7.97 True -torch_mem_eff cuda_attn_L512_bfloat16 8.47 True +torch_mem_eff cuda_attn_L128_bfloat16 1.89 True +torch_mem_eff cuda_attn_L256_bfloat16 1.95 True +torch_mem_eff cuda_attn_L320_bfloat16 2.05 True +torch_mem_eff cuda_attn_L384_bfloat16 2.08 True +torch_mem_eff cuda_attn_L448_bfloat16 2.13 True +torch_mem_eff cuda_attn_L512_bfloat16 2.27 True
-
-
▶ UV Install Logs
- -

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/sage_attention.html b/flash_attn/impls/sage_attention.html index 364f1f4634c22af85c73498d50323f326f86ac56..ab4f80472a285c7007aef3edb3b9473b4ac8170b 100644 --- a/flash_attn/impls/sage_attention.html +++ b/flash_attn/impls/sage_attention.html @@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: benchmark | 44.02s +Cell: benchmark | 4.37s | Raw @@ -3921,76 +3921,28 @@ Cell: benchmark | 44.02s
Running attention benchmark on cuda with 6 workloads.
 impl                     wl                  p50(ms)  ok
 sage_int8_fp16           cuda_attn_L128_bfloat16    FAIL  False
-  Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
+  Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L256_bfloat16    FAIL  False
-  Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
+  Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L320_bfloat16    FAIL  False
-  Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
+  Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L384_bfloat16    FAIL  False
-  Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
+  Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L448_bfloat16    FAIL  False
-  Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
+  Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L512_bfloat16    FAIL  False
-  Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
+  Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
 
▶ UV Install Logs
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] -Fetching 11 files: 9%|▉ | 1/11 [00:00<00:02, 3.52it/s] -Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 9.29it/s] -Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 12.03it/s]
+Fetching 11 files: 27%|██▋ | 3/11 [00:00<00:00, 14.92it/s] +Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 14.19it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 19.60it/s]

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/xformers.html b/flash_attn/impls/xformers.html index e7a61d66f041c6966cb9e2e4975416b77d7cb087..294f2ef1a8f4ed568426150c8120ee2f0c927541 100644 --- a/flash_attn/impls/xformers.html +++ b/flash_attn/impls/xformers.html @@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: benchmark | 45.32s +Cell: benchmark | 5.09s | Raw @@ -3923,21 +3923,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 5.28% 517.181us 24.85% 2.433ms 2.433ms 0.000us 0.00% 10.583ms 10.583ms 1 - xformers_flash3::flash_fwd 2.21% 216.725us 19.17% 1.877ms 625.707us 0.000us 0.00% 10.583ms 3.528ms 3 - flash_attn_3::fwd 0.75% 73.471us 16.96% 1.660ms 553.465us 7.934ms 100.00% 10.583ms 3.528ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 7.935ms 100.02% 7.935ms 7.935ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 7.934ms 100.00% 7.934ms 2.645ms 3 - Activity Buffer Request 15.30% 1.498ms 15.30% 1.498ms 1.498ms 2.649ms 33.39% 2.649ms 2.649ms 1 - aten::empty 0.35% 34.410us 0.35% 34.410us 5.735us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.13% 13.051us 0.13% 13.051us 4.350us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.42% 41.351us 0.42% 41.351us 13.784us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.14% 13.581us 0.40% 38.881us 6.480us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.26% 25.300us 0.26% 25.300us 4.217us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 75.15% 7.358ms 75.15% 7.358ms 7.358ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 10.73% 481.606us 51.24% 2.299ms 2.299ms 0.000us 0.00% 3.630ms 3.630ms 1 + xformers_flash3::flash_fwd 4.33% 194.084us 39.70% 1.781ms 593.782us 0.000us 0.00% 3.630ms 1.210ms 3 + flash_attn_3::fwd 1.76% 78.961us 35.37% 1.587ms 529.087us 2.729ms 100.00% 3.630ms 1.210ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.730ms 100.05% 2.730ms 2.730ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.729ms 100.00% 2.729ms 909.588us 3 + Activity Buffer Request 31.70% 1.423ms 31.70% 1.423ms 1.423ms 901.535us 33.04% 901.535us 901.535us 1 + aten::empty 0.75% 33.761us 0.75% 33.761us 5.627us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.28% 12.380us 0.28% 12.380us 4.127us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.88% 39.570us 0.88% 39.570us 13.190us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.30% 13.520us 0.80% 36.080us 6.013us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.50% 22.560us 0.50% 22.560us 3.760us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 48.76% 2.188ms 48.76% 2.188ms 2.188ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 9.791ms -Self CUDA time total: 7.934ms +Self CPU time total: 4.487ms +Self CUDA time total: 2.729ms @@ -3947,21 +3947,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 2.97% 376.750us 17.03% 2.160ms 2.160ms 0.000us 0.00% 14.695ms 14.695ms 1 - xformers_flash3::flash_fwd 1.31% 166.673us 13.88% 1.760ms 586.646us 0.000us 0.00% 14.695ms 4.898ms 3 - flash_attn_3::fwd 0.41% 52.370us 12.57% 1.593ms 531.088us 11.013ms 100.00% 14.695ms 4.898ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 11.015ms 100.02% 11.015ms 11.015ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 11.013ms 100.00% 11.013ms 3.671ms 3 - Activity Buffer Request 11.62% 1.473ms 11.62% 1.473ms 1.473ms 3.682ms 33.43% 3.682ms 3.682ms 1 - aten::empty 0.22% 28.511us 0.22% 28.511us 4.752us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.04% 5.391us 0.04% 5.391us 1.797us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.27% 34.441us 0.27% 34.441us 11.480us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.07% 8.699us 0.18% 22.949us 3.825us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.11% 14.250us 0.11% 14.250us 2.375us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 82.97% 10.518ms 82.97% 10.518ms 10.518ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 7.10% 312.113us 46.81% 2.059ms 2.059ms 0.000us 0.00% 3.744ms 3.744ms 1 + xformers_flash3::flash_fwd 3.88% 170.673us 39.17% 1.723ms 574.405us 0.000us 0.00% 3.744ms 1.248ms 3 + flash_attn_3::fwd 1.28% 56.171us 35.29% 1.553ms 517.514us 2.795ms 100.00% 3.744ms 1.248ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.796ms 100.05% 2.796ms 2.796ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.00% 2.795ms 931.630us 3 + Activity Buffer Request 32.47% 1.428ms 32.47% 1.428ms 1.428ms 948.729us 33.95% 948.729us 948.729us 1 + aten::empty 0.66% 29.091us 0.66% 29.091us 4.848us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.13% 5.590us 0.13% 5.590us 1.863us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.76% 33.440us 0.76% 33.440us 11.147us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.20% 8.951us 0.54% 23.831us 3.972us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.34% 14.880us 0.34% 14.880us 2.480us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 53.19% 2.340ms 53.19% 2.340ms 2.340ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 12.678ms -Self CUDA time total: 11.013ms +Self CPU time total: 4.399ms +Self CUDA time total: 2.795ms @@ -3971,21 +3971,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 2.76% 351.879us 17.06% 2.178ms 2.178ms 0.000us 0.00% 14.911ms 14.911ms 1 - xformers_flash3::flash_fwd 1.47% 187.843us 14.11% 1.803ms 600.839us 0.000us 0.00% 14.911ms 4.970ms 3 - flash_attn_3::fwd 0.41% 52.611us 12.64% 1.615ms 538.225us 11.083ms 100.00% 14.911ms 4.970ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 11.085ms 100.02% 11.085ms 11.085ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 11.083ms 100.00% 11.083ms 3.694ms 3 - Activity Buffer Request 11.67% 1.491ms 11.67% 1.491ms 1.491ms 3.829ms 34.54% 3.829ms 3.829ms 1 - aten::empty 0.23% 29.661us 0.23% 29.661us 4.944us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.04% 5.680us 0.04% 5.680us 1.893us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.28% 35.941us 0.28% 35.941us 11.980us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.07% 8.779us 0.19% 23.920us 3.987us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.12% 15.141us 0.12% 15.141us 2.524us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 82.94% 10.593ms 82.94% 10.593ms 10.593ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 6.52% 299.466us 45.41% 2.085ms 2.085ms 0.000us 0.00% 3.907ms 3.907ms 1 + xformers_flash3::flash_fwd 3.09% 142.061us 38.39% 1.763ms 587.558us 0.000us 0.00% 3.907ms 1.302ms 3 + flash_attn_3::fwd 1.15% 53.012us 35.30% 1.621ms 540.204us 2.913ms 100.00% 3.907ms 1.302ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.915ms 100.06% 2.915ms 2.915ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.913ms 100.00% 2.913ms 971.158us 3 + Activity Buffer Request 32.68% 1.500ms 32.68% 1.500ms 1.500ms 993.281us 34.09% 993.281us 993.281us 1 + aten::empty 0.62% 28.380us 0.62% 28.380us 4.730us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 5.270us 0.11% 5.270us 1.757us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.73% 33.640us 0.73% 33.640us 11.213us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.18% 8.421us 0.49% 22.660us 3.777us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.31% 14.239us 0.31% 14.239us 2.373us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 54.59% 2.507ms 54.59% 2.507ms 2.507ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 12.771ms -Self CUDA time total: 11.083ms +Self CPU time total: 4.591ms +Self CUDA time total: 2.913ms @@ -3995,21 +3995,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 2.60% 343.688us 18.22% 2.412ms 2.412ms 0.000us 0.00% 15.065ms 15.065ms 1 - xformers_flash3::flash_fwd 1.25% 165.081us 15.45% 2.045ms 681.611us 0.000us 0.00% 15.065ms 5.022ms 3 - flash_attn_3::fwd 0.38% 50.950us 14.20% 1.880ms 626.584us 11.285ms 100.00% 15.065ms 5.022ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 11.286ms 100.02% 11.286ms 11.286ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 11.285ms 100.00% 11.285ms 3.762ms 3 - Activity Buffer Request 11.56% 1.531ms 11.56% 1.531ms 1.531ms 3.781ms 33.50% 3.781ms 3.781ms 1 - aten::empty 0.22% 29.192us 0.22% 29.192us 4.865us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.04% 5.370us 0.04% 5.370us 1.790us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.99% 263.376us 1.99% 263.376us 87.792us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.07% 9.160us 0.18% 23.762us 3.960us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.11% 14.602us 0.11% 14.602us 2.434us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 81.78% 10.825ms 81.78% 10.825ms 10.825ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 6.26% 300.335us 46.54% 2.234ms 2.234ms 0.000us 0.00% 3.980ms 3.980ms 1 + xformers_flash3::flash_fwd 3.08% 147.673us 39.81% 1.911ms 637.009us 0.000us 0.00% 3.980ms 1.327ms 3 + flash_attn_3::fwd 1.12% 53.571us 36.74% 1.763ms 587.785us 2.981ms 100.00% 3.980ms 1.327ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.982ms 100.05% 2.982ms 2.982ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.981ms 100.00% 2.981ms 993.631us 3 + Activity Buffer Request 29.81% 1.431ms 29.81% 1.431ms 1.431ms 999.263us 33.52% 999.263us 999.263us 1 + aten::empty 0.60% 28.930us 0.60% 28.930us 4.822us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.610us 0.12% 5.610us 1.870us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 5.09% 244.533us 5.09% 244.533us 81.511us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.18% 8.489us 0.47% 22.530us 3.755us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.29% 14.041us 0.29% 14.041us 2.340us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 53.46% 2.566ms 53.46% 2.566ms 2.566ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 13.238ms -Self CUDA time total: 11.285ms +Self CPU time total: 4.800ms +Self CUDA time total: 2.981ms @@ -4019,21 +4019,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 2.46% 345.459us 17.01% 2.385ms 2.385ms 0.000us 0.00% 16.124ms 16.124ms 1 - xformers_flash3::flash_fwd 1.15% 161.632us 14.38% 2.017ms 672.171us 0.000us 0.00% 16.124ms 5.375ms 3 - flash_attn_3::fwd 0.37% 51.683us 13.23% 1.855ms 618.293us 12.092ms 100.00% 16.124ms 5.375ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 12.094ms 100.02% 12.094ms 12.094ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 12.092ms 100.00% 12.092ms 4.031ms 3 - Activity Buffer Request 10.69% 1.499ms 10.69% 1.499ms 1.499ms 4.032ms 33.35% 4.032ms 4.032ms 1 - aten::empty 0.21% 29.140us 0.21% 29.140us 4.857us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.04% 5.520us 0.04% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.92% 269.435us 1.92% 269.435us 89.812us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.06% 9.069us 0.16% 22.880us 3.813us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.10% 13.811us 0.10% 13.811us 2.302us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 82.99% 11.636ms 82.99% 11.636ms 11.636ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 5.98% 313.865us 42.05% 2.207ms 2.207ms 0.000us 0.00% 4.635ms 4.635ms 1 + xformers_flash3::flash_fwd 2.80% 146.723us 35.63% 1.870ms 623.176us 0.000us 0.00% 4.635ms 1.545ms 3 + flash_attn_3::fwd 0.99% 51.861us 32.83% 1.723ms 574.268us 3.467ms 100.00% 4.635ms 1.545ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.469ms 100.05% 3.469ms 3.469ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.467ms 100.00% 3.467ms 1.156ms 3 + Activity Buffer Request 27.82% 1.460ms 27.82% 1.460ms 1.460ms 1.168ms 33.68% 1.168ms 1.168ms 1 + aten::empty 0.56% 29.260us 0.56% 29.260us 4.877us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 6.040us 0.12% 6.040us 2.013us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.35% 175.903us 3.35% 175.903us 58.634us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.16% 8.638us 0.44% 23.169us 3.862us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.28% 14.531us 0.28% 14.531us 2.422us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 57.95% 3.041ms 57.95% 3.041ms 3.041ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 14.021ms -Self CUDA time total: 12.092ms +Self CPU time total: 5.247ms +Self CUDA time total: 3.467ms @@ -4043,83 +4043,37 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 2.36% 347.389us 16.65% 2.455ms 2.455ms 0.000us 0.00% 16.980ms 16.980ms 1 - xformers_flash3::flash_fwd 1.09% 160.181us 14.14% 2.085ms 695.001us 0.000us 0.00% 16.980ms 5.660ms 3 - flash_attn_3::fwd 0.36% 52.921us 13.05% 1.925ms 641.607us 12.735ms 100.00% 16.980ms 5.660ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 12.738ms 100.02% 12.738ms 12.738ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 12.735ms 100.00% 12.735ms 4.245ms 3 - Activity Buffer Request 10.11% 1.491ms 10.11% 1.491ms 1.491ms 4.245ms 33.33% 4.245ms 4.245ms 1 - aten::empty 0.20% 29.922us 0.20% 29.922us 4.987us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.04% 5.530us 0.04% 5.530us 1.843us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 2.34% 345.117us 2.34% 345.117us 115.039us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.06% 8.379us 0.15% 22.620us 3.770us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.10% 14.241us 0.10% 14.241us 2.373us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 83.35% 12.290ms 83.35% 12.290ms 12.290ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 5.97% 309.094us 41.86% 2.166ms 2.166ms 0.000us 0.00% 4.567ms 4.567ms 1 + xformers_flash3::flash_fwd 2.75% 142.242us 35.45% 1.834ms 611.405us 0.000us 0.00% 4.567ms 1.522ms 3 + flash_attn_3::fwd 1.04% 53.951us 32.70% 1.692ms 563.991us 3.419ms 100.00% 4.567ms 1.522ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.421ms 100.05% 3.421ms 3.421ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.419ms 100.00% 3.419ms 1.140ms 3 + Activity Buffer Request 27.74% 1.436ms 27.74% 1.436ms 1.436ms 1.148ms 33.59% 1.148ms 1.148ms 1 + aten::empty 0.58% 29.770us 0.58% 29.770us 4.962us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 5.591us 0.11% 5.591us 1.864us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.23% 167.152us 3.23% 167.152us 55.717us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.16% 8.371us 0.44% 22.751us 3.792us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.28% 14.380us 0.28% 14.380us 2.397us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 58.14% 3.008ms 58.14% 3.008ms 3.008ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 14.745ms -Self CUDA time total: 12.735ms +Self CPU time total: 5.174ms +Self CUDA time total: 3.419ms impl wl p50(ms) ok -xformers_meff cuda_attn_L128_bfloat16 3.60 True -xformers_meff cuda_attn_L256_bfloat16 3.43 True -xformers_meff cuda_attn_L320_bfloat16 4.10 True -xformers_meff cuda_attn_L384_bfloat16 4.01 True -xformers_meff cuda_attn_L448_bfloat16 4.21 True -xformers_meff cuda_attn_L512_bfloat16 4.43 True +xformers_meff cuda_attn_L128_bfloat16 1.00 True +xformers_meff cuda_attn_L256_bfloat16 1.04 True +xformers_meff cuda_attn_L320_bfloat16 1.09 True +xformers_meff cuda_attn_L384_bfloat16 1.11 True +xformers_meff cuda_attn_L448_bfloat16 1.26 True +xformers_meff cuda_attn_L512_bfloat16 1.25 True
▶ UV Install Logs
diff --git a/flash_attn/results/artifacts/combine/latency.svg b/flash_attn/results/artifacts/combine/latency.svg index 3367735a9738724cca392eba39308eb7893657bd..671245b29a5bb3712886378686087bbd4b801023 100644 --- a/flash_attn/results/artifacts/combine/latency.svg +++ b/flash_attn/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3bfd4c5e82f8daf2fec939924eb6dc23b3e5d20e8327316e4f8b69db047e2a9 -size 24011 +oid sha256:a94beca550ea0b3ff8a0f0eef062da6a6179ae09e78edc24cbacb71d8bd623a2 +size 24784 diff --git a/flash_attn/results/combined_results.html b/flash_attn/results/combined_results.html index 83a9d73329f9582b6415102ca8b59cbdb9f81586..d8c030e7979e6d4a765ee5da1a3df4da10896636 100644 --- a/flash_attn/results/combined_results.html +++ b/flash_attn/results/combined_results.html @@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - 2025-10-24T19:27:34.267507 + 2025-10-27T14:46:38.946915 image/svg+xml @@ -3891,320 +3891,333 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - + - + - cuda_attn_L128_bfloat16 + cuda_attn_L128_bfloat16 - + - + - cuda_attn_L256_bfloat16 + cuda_attn_L256_bfloat16 - + - + - cuda_attn_L320_bfloat16 + cuda_attn_L320_bfloat16 - + - + - cuda_attn_L384_bfloat16 + cuda_attn_L384_bfloat16 - + - + - cuda_attn_L448_bfloat16 + cuda_attn_L448_bfloat16 - + - + - cuda_attn_L512_bfloat16 + cuda_attn_L512_bfloat16 - Workload + Workload - + - + - 3 + 1.0 - + - + - 4 + 1.2 - + - + - 5 + 1.4 - + - + - 6 + 1.6 - + - + - 7 + 1.8 - + - + - 8 + 2.0 + + + + + + + + + + + + + 2.2 - Latency P50 (ms) + Latency P50 (ms) - + - - - - - - - + + + + + + + - + - - - - - - - + + + + + + + - + - - - - - - - + + + + + + + - + - - - - - - - + + + + + + + - + - - - - - - - + + + + + + + - + - + - + - - Attention Implementation Latency + + Attention Implementation Latency - + - - + + - + - torch_flash_ma + torch_flash_ma - - + + - + - torch_mem_eff + torch_mem_eff - - + + - + - xformers_meff + xformers_meff - - + + - + - hf_kernels_flash_attn + hf_kernels_flash_attn - - + + - + - hf_kernels_flash_attn3 + hf_kernels_flash_attn3 - - + + @@ -4217,7 +4230,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: combine | 39.40s +Cell: combine | 4.50s | Raw @@ -4297,25 +4310,25 @@ Cell: combine | 39.40s
======================================================================
 LOADING BENCHMARK DATA
 ======================================================================
-✓ Flash (PyTorch SDPA)          : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/1229e2a918a2e0c395750645114ee4e0e721d5f703c5221972db88ca3fe9e8b9
-✓ MemEff (PyTorch SDPA)         : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/2ad4910cf70b34b5a3a316e2d789b9763d6651ee3d6727249ff229320cd58d24
-✓ xFormers                      : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/4883c01f586350408f08d21d8d78943b44e953dd559356f9392f803696daca1a
-✓ HF Kernels Flash Attn         : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/0c83351d95394732eb53074c734760a0bef9733834298a6c04a08d9ec6a12660
-✓ HF Kernels Flash Attn3        : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/f514c47cfa55db88f58672a033a040f67509b5051b8e3f332dd6d20ae85a88a8
-✓ SageAttention                 : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/44c62c72b6cd63934c6e75ade8b74c9428734ea4f94c030632b382a8f0107a57
+✓ Flash (PyTorch SDPA)          : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/4b81c2b991fc4a0f70c4117e933abc4007fd7f3f55394d7778a4074adf29df04
+✓ MemEff (PyTorch SDPA)         : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/a23b7ad9cfb9e9968ec4a8f126174dc4a3ab5e6999c65a44570f93656598bd2f
+✓ xFormers                      : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/269846603898e0ee1872d7a8b40fca43ba558b2f3400f8a7bedb1ee79df7da58
+✓ HF Kernels Flash Attn         : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/c1c92a22d205ca145ffb0083188c0f8eef512cfd6aa091b1e49d6329fbd08849
+✓ HF Kernels Flash Attn3        : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20
+✓ SageAttention                 : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/1355120a3e88bcb74f4130be51dfe8b03e7dc2b7823f2a53b20da7899570a16f
 
   ✓ Found Flash (PyTorch SDPA)
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/1229e2a918a2e0c395750645114ee4e0e721d5f703c5221972db88ca3fe9e8b9/attention.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/4b81c2b991fc4a0f70c4117e933abc4007fd7f3f55394d7778a4074adf29df04/attention.jsonl
   ✓ Found MemEff (PyTorch SDPA)
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/2ad4910cf70b34b5a3a316e2d789b9763d6651ee3d6727249ff229320cd58d24/attention.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/a23b7ad9cfb9e9968ec4a8f126174dc4a3ab5e6999c65a44570f93656598bd2f/attention.jsonl
   ✓ Found xFormers
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/4883c01f586350408f08d21d8d78943b44e953dd559356f9392f803696daca1a/attention.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/269846603898e0ee1872d7a8b40fca43ba558b2f3400f8a7bedb1ee79df7da58/attention.jsonl
   ✓ Found HF Kernels Flash Attn
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/0c83351d95394732eb53074c734760a0bef9733834298a6c04a08d9ec6a12660/attention.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/c1c92a22d205ca145ffb0083188c0f8eef512cfd6aa091b1e49d6329fbd08849/attention.jsonl
   ✓ Found HF Kernels Flash Attn3
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/f514c47cfa55db88f58672a033a040f67509b5051b8e3f332dd6d20ae85a88a8/attention.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20/attention.jsonl
   ✓ Found SageAttention
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/flash_attn/impls/.uvnote/cache/44c62c72b6cd63934c6e75ade8b74c9428734ea4f94c030632b382a8f0107a57/attention.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/1355120a3e88bcb74f4130be51dfe8b03e7dc2b7823f2a53b20da7899570a16f/attention.jsonl
 
 ======================================================================
 Summary: 6 found, 0 skipped, 0 missing
@@ -4324,48 +4337,48 @@ Summary: 6 found, 0 skipped, 0 missing
 COMBINED BENCHMARK SUMMARY
 
 impl                     wl                  p50(ms)  ok
-hf_kernels_flash_attn    cuda_attn_L128_bfloat16     2.82  True
-hf_kernels_flash_attn    cuda_attn_L256_bfloat16     3.91  True
-hf_kernels_flash_attn    cuda_attn_L320_bfloat16     4.12  True
-hf_kernels_flash_attn    cuda_attn_L384_bfloat16     4.13  True
-hf_kernels_flash_attn    cuda_attn_L448_bfloat16     4.11  True
-hf_kernels_flash_attn    cuda_attn_L512_bfloat16     4.57  True
-hf_kernels_flash_attn3   cuda_attn_L128_bfloat16     3.22  True
-hf_kernels_flash_attn3   cuda_attn_L256_bfloat16     3.77  True
-hf_kernels_flash_attn3   cuda_attn_L320_bfloat16     3.91  True
-hf_kernels_flash_attn3   cuda_attn_L384_bfloat16     3.97  True
-hf_kernels_flash_attn3   cuda_attn_L448_bfloat16     4.19  True
-hf_kernels_flash_attn3   cuda_attn_L512_bfloat16     4.41  True
+hf_kernels_flash_attn    cuda_attn_L128_bfloat16     0.98  True
+hf_kernels_flash_attn    cuda_attn_L256_bfloat16     1.02  True
+hf_kernels_flash_attn    cuda_attn_L320_bfloat16     1.05  True
+hf_kernels_flash_attn    cuda_attn_L384_bfloat16     1.07  True
+hf_kernels_flash_attn    cuda_attn_L448_bfloat16     1.23  True
+hf_kernels_flash_attn    cuda_attn_L512_bfloat16     1.23  True
+hf_kernels_flash_attn3   cuda_attn_L128_bfloat16     0.95  True
+hf_kernels_flash_attn3   cuda_attn_L256_bfloat16     0.98  True
+hf_kernels_flash_attn3   cuda_attn_L320_bfloat16     1.03  True
+hf_kernels_flash_attn3   cuda_attn_L384_bfloat16     1.04  True
+hf_kernels_flash_attn3   cuda_attn_L448_bfloat16     1.21  True
+hf_kernels_flash_attn3   cuda_attn_L512_bfloat16     1.18  True
 sage_int8_fp16           cuda_attn_L128_bfloat16    FAIL  False
-  Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
+  Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L256_bfloat16    FAIL  False
-  Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
+  Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L320_bfloat16    FAIL  False
-  Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
+  Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L384_bfloat16    FAIL  False
-  Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
+  Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L448_bfloat16    FAIL  False
-  Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
+  Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L512_bfloat16    FAIL  False
-  Error: module 'sage_attention_c9b3a60c7e3c5091' has no attribute 'fwd'
-torch_flash_ma           cuda_attn_L128_bfloat16     4.09  True
-torch_flash_ma           cuda_attn_L256_bfloat16     4.79  True
-torch_flash_ma           cuda_attn_L320_bfloat16     4.90  True
-torch_flash_ma           cuda_attn_L384_bfloat16     4.98  True
-torch_flash_ma           cuda_attn_L448_bfloat16     5.05  True
-torch_flash_ma           cuda_attn_L512_bfloat16     5.47  True
-torch_mem_eff            cuda_attn_L128_bfloat16     6.77  True
-torch_mem_eff            cuda_attn_L256_bfloat16     7.24  True
-torch_mem_eff            cuda_attn_L320_bfloat16     7.52  True
-torch_mem_eff            cuda_attn_L384_bfloat16     7.59  True
-torch_mem_eff            cuda_attn_L448_bfloat16     7.97  True
-torch_mem_eff            cuda_attn_L512_bfloat16     8.47  True
-xformers_meff            cuda_attn_L128_bfloat16     3.60  True
-xformers_meff            cuda_attn_L256_bfloat16     3.43  True
-xformers_meff            cuda_attn_L320_bfloat16     4.10  True
-xformers_meff            cuda_attn_L384_bfloat16     4.01  True
-xformers_meff            cuda_attn_L448_bfloat16     4.21  True
-xformers_meff            cuda_attn_L512_bfloat16     4.43  True
+  Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
+torch_flash_ma           cuda_attn_L128_bfloat16     1.22  True
+torch_flash_ma           cuda_attn_L256_bfloat16     1.27  True
+torch_flash_ma           cuda_attn_L320_bfloat16     1.31  True
+torch_flash_ma           cuda_attn_L384_bfloat16     1.34  True
+torch_flash_ma           cuda_attn_L448_bfloat16     1.48  True
+torch_flash_ma           cuda_attn_L512_bfloat16     1.52  True
+torch_mem_eff            cuda_attn_L128_bfloat16     1.89  True
+torch_mem_eff            cuda_attn_L256_bfloat16     1.95  True
+torch_mem_eff            cuda_attn_L320_bfloat16     2.05  True
+torch_mem_eff            cuda_attn_L384_bfloat16     2.08  True
+torch_mem_eff            cuda_attn_L448_bfloat16     2.13  True
+torch_mem_eff            cuda_attn_L512_bfloat16     2.27  True
+xformers_meff            cuda_attn_L128_bfloat16     1.00  True
+xformers_meff            cuda_attn_L256_bfloat16     1.04  True
+xformers_meff            cuda_attn_L320_bfloat16     1.09  True
+xformers_meff            cuda_attn_L384_bfloat16     1.11  True
+xformers_meff            cuda_attn_L448_bfloat16     1.26  True
+xformers_meff            cuda_attn_L512_bfloat16     1.25  True
 
 GENERATING COMBINED VISUALIZATION
 
@@ -4389,53 +4402,7 @@ Implementations included:
 
▶ UV Install Logs
@@ -4448,7 +4415,7 @@ Installed 37 packages in 230ms - 2025-10-24T19:27:34.267507 + 2025-10-27T14:46:38.946915 image/svg+xml @@ -4467,320 +4434,333 @@ Installed 37 packages in 230ms - + - + - + - cuda_attn_L128_bfloat16 + cuda_attn_L128_bfloat16 - + - + - cuda_attn_L256_bfloat16 + cuda_attn_L256_bfloat16 - + - + - cuda_attn_L320_bfloat16 + cuda_attn_L320_bfloat16 - + - + - cuda_attn_L384_bfloat16 + cuda_attn_L384_bfloat16 - + - + - cuda_attn_L448_bfloat16 + cuda_attn_L448_bfloat16 - + - + - cuda_attn_L512_bfloat16 + cuda_attn_L512_bfloat16 - Workload + Workload - + - + - 3 + 1.0 - + - + - 4 + 1.2 - + - + - 5 + 1.4 - + - + - 6 + 1.6 - + - + - 7 + 1.8 - + - + - 8 + 2.0 + + + + + + + + + + + + + 2.2 - Latency P50 (ms) + Latency P50 (ms) - + - - - - - - - + + + + + + + - + - - - - - - - + + + + + + + - + - - - - - - - + + + + + + + - + - - - - - - - + + + + + + + - + - - - - - - - + + + + + + + - + - + - + - - Attention Implementation Latency + + Attention Implementation Latency - + - - + + - + - torch_flash_ma + torch_flash_ma - - + + - + - torch_mem_eff + torch_mem_eff - - + + - + - xformers_meff + xformers_meff - - + + - + - hf_kernels_flash_attn + hf_kernels_flash_attn - - + + - + - hf_kernels_flash_attn3 + hf_kernels_flash_attn3 - - + + diff --git a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl index c6e99c7cf12fb0468248ecd58ebe47ba3c3385d9..a79e8f8bdbd98b4943b0ef6a74d24d280882e16b 100644 --- a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +++ b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl @@ -1,48 +1,48 @@ -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D1024", "batch": 1, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.028959999326616526, "p50": 0.029670016374439, "p90": 0.02976099494844675, "mean": 0.030270603019744158, "iqr": 0.00046996865421533585, "raw_times": [0.028959999326616526, 0.02976099494844675, 0.029291026294231415, 0.029670016374439, 0.0336709781549871], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03676099004223943, "peak_bytes": 2363392, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00151824951171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D2048", "batch": 1, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.0324210268445313, "p50": 0.03255100455135107, "p90": 0.03347999881953001, "mean": 0.0366490101441741, "iqr": 0.0010589719749987125, "raw_times": [0.0324210268445313, 0.05237199366092682, 0.03347999881953001, 0.03255100455135107, 0.0324210268445313], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03719097003340721, "peak_bytes": 4726784, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0014801025390625, "mse": 1.0728836059570312e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D4096", "batch": 1, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.030800001695752144, "p50": 0.032610027119517326, "p90": 0.03269099397584796, "mean": 0.03422859590500593, "iqr": 0.0001300359144806862, "raw_times": [0.042480998672544956, 0.03256095806136727, 0.032610027119517326, 0.03269099397584796, 0.030800001695752144], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.038141035474836826, "peak_bytes": 9453568, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00153350830078125, "mse": 1.0967254638671875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D8192", "batch": 1, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.031030969694256783, "p50": 0.03305997233837843, "p90": 0.03334099892526865, "mean": 0.034240796230733395, "iqr": 0.0011699739843606949, "raw_times": [0.031030969694256783, 0.03334099892526865, 0.03305997233837843, 0.032171024940907955, 0.041601015254855156], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03404001472517848, "peak_bytes": 18907136, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00148773193359375, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D1024", "batch": 1, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.030020950362086296, "p50": 0.03277999348938465, "p90": 0.03290100721642375, "mean": 0.03224459942430258, "iqr": 0.0006309710443019867, "raw_times": [0.03227003617212176, 0.03290100721642375, 0.03277999348938465, 0.030020950362086296, 0.03325100988149643], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.034740951377898455, "peak_bytes": 9441280, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.001556396484375, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D2048", "batch": 1, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.029260001610964537, "p50": 0.031291041523218155, "p90": 0.03236101474612951, "mean": 0.03568681422621012, "iqr": 0.0016400008462369442, "raw_times": [0.029260001610964537, 0.03236101474612951, 0.054800999350845814, 0.03072101389989257, 0.031291041523218155], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03347097663208842, "peak_bytes": 18882560, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D4096", "batch": 1, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.029839982744306326, "p50": 0.031700998079031706, "p90": 0.03264000406488776, "mean": 0.031820591539144516, "iqr": 0.0009690411388874054, "raw_times": [0.029839982744306326, 0.03167096292600036, 0.03264000406488776, 0.031700998079031706, 0.03325100988149643], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03392098005861044, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D8192", "batch": 1, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.03726099384948611, "p50": 0.03943097544834018, "p90": 0.040319981053471565, "mean": 0.039948790799826384, "iqr": 0.0009189825505018234, "raw_times": [0.03726099384948611, 0.03940099850296974, 0.04333100514486432, 0.040319981053471565, 0.03943097544834018], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.040440005250275135, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0014801025390625, "mse": 1.0848045349121094e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D1024", "batch": 1, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.029730028472840786, "p50": 0.03197102341800928, "p90": 0.03233103780075908, "mean": 0.03148262621834874, "iqr": 0.001300009898841381, "raw_times": [0.029730028472840786, 0.03197102341800928, 0.03235001349821687, 0.031031027901917696, 0.03233103780075908], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.034419994335621595, "peak_bytes": 21008384, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1324882507324219e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D2048", "batch": 1, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.030681025236845016, "p50": 0.031400995794683695, "p90": 0.03258103970438242, "mean": 0.03199680941179395, "iqr": 0.0015910482034087181, "raw_times": [0.030681025236845016, 0.03258103970438242, 0.034330994822084904, 0.0309899915009737, 0.031400995794683695], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03291096072643995, "peak_bytes": 37756928, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D4096", "batch": 1, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.03773096250370145, "p50": 0.03855105023831129, "p90": 0.03870099317282438, "mean": 0.038839003536850214, "iqr": 0.00018998980522155762, "raw_times": [0.03773096250370145, 0.04070100840181112, 0.03855105023831129, 0.038511003367602825, 0.03870099317282438], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04059000639244914, "peak_bytes": 75513856, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D8192", "batch": 1, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05229096859693527, "p50": 0.05274196155369282, "p90": 0.052841962315142155, "mean": 0.05267937667667866, "iqr": 0.00024097971618175507, "raw_times": [0.05274196155369282, 0.052841962315142155, 0.05229096859693527, 0.05292100831866264, 0.0526009825989604], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05453097401186824, "peak_bytes": 151027712, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.001495361328125, "mse": 1.0967254638671875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D1024", "batch": 1, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.033741001971066, "p50": 0.03515096614137292, "p90": 0.035751028917729855, "mean": 0.035592797212302685, "iqr": 0.0010509975254535675, "raw_times": [0.033741001971066, 0.03515096614137292, 0.038620957639068365, 0.035751028917729855, 0.03470003139227629], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03478099824860692, "peak_bytes": 41979904, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D2048", "batch": 1, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.040161015931516886, "p50": 0.04095997428521514, "p90": 0.04124100087210536, "mean": 0.04105480620637536, "iqr": 0.00031996751204133034, "raw_times": [0.040161015931516886, 0.04199100658297539, 0.04124100087210536, 0.04095997428521514, 0.04092103336006403], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04168099258095026, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D4096", "batch": 1, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05262100603431463, "p50": 0.05288200918585062, "p90": 0.0531109981238842, "mean": 0.05337720504030585, "iqr": 0.00027997884899377823, "raw_times": [0.05283101927489042, 0.05262100603431463, 0.05288200918585062, 0.0531109981238842, 0.05544099258258939], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05422096000984311, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D8192", "batch": 1, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.2716059680096805, "p50": 0.2742449869401753, "p90": 0.2774460008367896, "mean": 0.2755257999524474, "iqr": 0.0037999707274138927, "raw_times": [0.28068601386621594, 0.2736460301093757, 0.2742449869401753, 0.2716059680096805, 0.2774460008367896], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.27578597655519843, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D1024", "batch": 4, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.030030030757188797, "p50": 0.031871022656559944, "p90": 0.03221101360395551, "mean": 0.03341861302033067, "iqr": 0.00040099257603287697, "raw_times": [0.030030030757188797, 0.03221101360395551, 0.04117097705602646, 0.031871022656559944, 0.03181002102792263], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.034871045500040054, "peak_bytes": 69242880, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.001556396484375, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D2048", "batch": 4, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.02923101419582963, "p50": 0.030590977985411882, "p90": 0.030929979402571917, "mean": 0.030384794808924198, "iqr": 0.0010989606380462646, "raw_times": [0.02923101419582963, 0.03134098369628191, 0.030590977985411882, 0.029831018764525652, 0.030929979402571917], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03408099291846156, "peak_bytes": 18882560, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D4096", "batch": 4, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.030279974453151226, "p50": 0.03076100256294012, "p90": 0.03130995901301503, "mean": 0.031078385654836893, "iqr": 0.0005599576979875565, "raw_times": [0.03130995901301503, 0.03076100256294012, 0.030279974453151226, 0.03229099093005061, 0.030750001315027475], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03285001730546355, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D8192", "batch": 4, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.03846001345664263, "p50": 0.03899098373949528, "p90": 0.039101054426282644, "mean": 0.039028620813041925, "iqr": 0.00023102620616555214, "raw_times": [0.03899098373949528, 0.039101054426282644, 0.039721024222671986, 0.03887002822011709, 0.03846001345664263], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.040511018596589565, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0014801025390625, "mse": 1.0848045349121094e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D1024", "batch": 4, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.0330110196955502, "p50": 0.034221040550619364, "p90": 0.03488000947982073, "mean": 0.03425482427701354, "iqr": 0.0009589712135493755, "raw_times": [0.0330110196955502, 0.034221040550619364, 0.03524101339280605, 0.03392103826627135, 0.03488000947982073], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.036880956031382084, "peak_bytes": 37752832, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D2048", "batch": 4, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.039421021938323975, "p50": 0.039631035178899765, "p90": 0.041121034882962704, "mean": 0.04027721006423235, "iqr": 0.0015800469554960728, "raw_times": [0.039631035178899765, 0.04167197039350867, 0.041121034882962704, 0.03954098792746663, 0.039421021938323975], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04271004581823945, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D4096", "batch": 4, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.052101968321949244, "p50": 0.05301198689267039, "p90": 0.053400988690555096, "mean": 0.052991590928286314, "iqr": 0.0005399924702942371, "raw_times": [0.052101968321949244, 0.05358201451599598, 0.05286099622026086, 0.05301198689267039, 0.053400988690555096], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054280972108244896, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D8192", "batch": 4, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.2699450124055147, "p50": 0.2703659702092409, "p90": 0.2711050328798592, "mean": 0.27191760018467903, "iqr": 0.0009190407581627369, "raw_times": [0.2703659702092409, 0.2711050328798592, 0.27798599330708385, 0.27018599212169647, 0.2699450124055147], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.27769600274041295, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D1024", "batch": 4, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.04519004141911864, "p50": 0.045621010940521955, "p90": 0.04610104952007532, "mean": 0.046770821791142225, "iqr": 0.0005300389602780342, "raw_times": [0.04557101055979729, 0.04610104952007532, 0.045621010940521955, 0.04519004141911864, 0.05137099651619792], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0482110190205276, "peak_bytes": 83922944, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D2048", "batch": 4, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05651102401316166, "p50": 0.05743099609389901, "p90": 0.05767098627984524, "mean": 0.05834901239722967, "iqr": 0.00033993273973464966, "raw_times": [0.05651102401316166, 0.06280100205913186, 0.05743099609389901, 0.05767098627984524, 0.05733105354011059], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05965103628113866, "peak_bytes": 151003136, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D4096", "batch": 4, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.2797759952954948, "p50": 0.2818260109052062, "p90": 0.2828260185196996, "mean": 0.28369000647217035, "iqr": 0.0011200318112969398, "raw_times": [0.2797759952954948, 0.28170598670840263, 0.2828260185196996, 0.2818260109052062, 0.29231602093204856], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.278854975476861, "peak_bytes": 302006272, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D8192", "batch": 4, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5820420337840915, "p50": 0.589212984777987, "p90": 0.5898119998164475, "mean": 0.5889245891012251, "iqr": 0.002659042365849018, "raw_times": [0.5898119998164475, 0.5871529574505985, 0.589212984777987, 0.5820420337840915, 0.5964029696770012], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5985530442558229, "peak_bytes": 604012544, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D1024", "batch": 4, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.06898096762597561, "p50": 0.07057201582938433, "p90": 0.07107201963663101, "mean": 0.07093560416251421, "iqr": 0.0007100170478224754, "raw_times": [0.07369101513177156, 0.07036200258880854, 0.07107201963663101, 0.06898096762597561, 0.07057201582938433], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06987096276134253, "peak_bytes": 167809024, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D2048", "batch": 4, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.2780960057862103, "p50": 0.2848859876394272, "p90": 0.28691597981378436, "mean": 0.2848202013410628, "iqr": 0.003049965016543865, "raw_times": [0.2838660147972405, 0.28691597981378436, 0.2848859876394272, 0.2903370186686516, 0.2780960057862103], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.27952599339187145, "peak_bytes": 301998080, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015716552734375, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D4096", "batch": 4, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5754730082117021, "p50": 0.5831019952893257, "p90": 0.5851630121469498, "mean": 0.582532596308738, "iqr": 0.004841014742851257, "raw_times": [0.5831019952893257, 0.5851630121469498, 0.5803219974040985, 0.5886029684916139, 0.5754730082117021], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5842720274813473, "peak_bytes": 603996160, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D8192", "batch": 4, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 1.1429850128479302, "p50": 1.1474639759398997, "p90": 1.1484349961392581, "mean": 1.1493865866214037, "iqr": 0.001491047441959381, "raw_times": [1.1474639759398997, 1.1429850128479302, 1.1484349961392581, 1.1469439486972988, 1.1611049994826317], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1680549941956997, "peak_bytes": 1207992320, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D1024", "batch": 16, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.03370095510035753, "p50": 0.03467098576948047, "p90": 0.03499101148918271, "mean": 0.034558994229882956, "iqr": 0.0006700283847749233, "raw_times": [0.03432098310440779, 0.03467098576948047, 0.03499101148918271, 0.03511103568598628, 0.03370095510035753], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.035281002055853605, "peak_bytes": 276860928, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D2048", "batch": 16, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.04038098268210888, "p50": 0.04121096571907401, "p90": 0.041480991058051586, "mean": 0.04112699534744024, "iqr": 0.0006599584594368935, "raw_times": [0.04038098268210888, 0.04121096571907401, 0.04082103259861469, 0.041741004679352045, 0.041480991058051586], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04577101208269596, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D4096", "batch": 16, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.0530310207977891, "p50": 0.05350098945200443, "p90": 0.05359097849577665, "mean": 0.053398997988551855, "iqr": 0.0005399924702942371, "raw_times": [0.05350098945200443, 0.053050986025482416, 0.0530310207977891, 0.05359097849577665, 0.053821015171706676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.062031031120568514, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D8192", "batch": 16, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.26927603175863624, "p50": 0.27200503973290324, "p90": 0.27223501820117235, "mean": 0.2713776077143848, "iqr": 0.002459040842950344, "raw_times": [0.27200503973290324, 0.27359597152099013, 0.26927603175863624, 0.27223501820117235, 0.269775977358222], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2747260150499642, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D1024", "batch": 16, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.06719096563756466, "p50": 0.07005198858678341, "p90": 0.07023202488198876, "mean": 0.06946560461074114, "iqr": 0.0011699739843606949, "raw_times": [0.06719096563756466, 0.07005198858678341, 0.06906205089762807, 0.07079099304974079, 0.07023202488198876], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07167202420532703, "peak_bytes": 150999040, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D2048", "batch": 16, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.2804459654726088, "p50": 0.28543599182739854, "p90": 0.28567598201334476, "mean": 0.2847379771992564, "iqr": 0.0013799872249364853, "raw_times": [0.28543599182739854, 0.2878359518945217, 0.2842959947884083, 0.28567598201334476, 0.2804459654726088], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.28216600185260177, "peak_bytes": 301998080, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015716552734375, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D4096", "batch": 16, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5787130212411284, "p50": 0.5883330013602972, "p90": 0.5898720119148493, "mean": 0.5873608053661883, "iqr": 0.0030189985409379005, "raw_times": [0.5930329789407551, 0.5868530133739114, 0.5898720119148493, 0.5787130212411284, 0.5883330013602972], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5958119872957468, "peak_bytes": 603996160, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D8192", "batch": 16, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 1.1475840001367033, "p50": 1.149774994701147, "p90": 1.149774994701147, "mean": 1.149676798377186, "iqr": 0.0017299898900091648, "raw_times": [1.148045004811138, 1.149774994701147, 1.149774994701147, 1.153204997535795, 1.1475840001367033], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1586649925448, "peak_bytes": 1207992320, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D1024", "batch": 16, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.28807600028812885, "p50": 0.29028597055003047, "p90": 0.2923660213127732, "mean": 0.290472200140357, "iqr": 0.0033390242606401443, "raw_times": [0.2923660213127732, 0.29028597055003047, 0.2890269970521331, 0.28807600028812885, 0.29260601149871945], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.28624600963667035, "peak_bytes": 335581184, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D2048", "batch": 16, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5808720015920699, "p50": 0.5865029525011778, "p90": 0.5889830063097179, "mean": 0.5862265941686928, "iqr": 0.0050209928303956985, "raw_times": [0.5808720015920699, 0.5865029525011778, 0.5908129969611764, 0.5889830063097179, 0.5839620134793222], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5870030145160854, "peak_bytes": 603987968, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D4096", "batch": 16, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 1.1516850208863616, "p50": 1.1541039566509426, "p90": 1.159774954430759, "mean": 1.1568425805307925, "iqr": 0.006380956619977951, "raw_times": [1.1652549728751183, 1.153393997810781, 1.1541039566509426, 1.159774954430759, 1.1516850208863616], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1652849498204887, "peak_bytes": 1207975936, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D8192", "batch": 16, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 2.264778013341129, "p50": 2.2672080085612833, "p90": 2.2703579743392766, "mean": 2.2687464021146297, "iqr": 0.0047089415602386, "raw_times": [2.2703579743392766, 2.264778013341129, 2.2672080085612833, 2.265649032779038, 2.275738981552422], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.299049054272473, "peak_bytes": 2415951872, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00150299072265625, "mse": 1.0967254638671875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D1024", "batch": 16, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5862729740329087, "p50": 0.5921830306760967, "p90": 0.5925330333411694, "mean": 0.591674807947129, "iqr": 0.00036100391298532486, "raw_times": [0.592172029428184, 0.5921830306760967, 0.5925330333411694, 0.5952129722572863, 0.5862729740329087], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5872620386071503, "peak_bytes": 671125504, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D2048", "batch": 16, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 1.150664989836514, "p50": 1.156534010078758, "p90": 1.1567150359041989, "mean": 1.15486680297181, "iqr": 0.004480069037526846, "raw_times": [1.1567150359041989, 1.150664989836514, 1.1581850121729076, 1.156534010078758, 1.152234966866672], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.150145020801574, "peak_bytes": 1207967744, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.00156402587890625, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 2.2770979558117688, "p50": 2.2999990032985806, "p90": 2.302108972799033, "mean": 2.2958547924645245, "iqr": 0.012759934179484844, "raw_times": [2.2770979558117688, 2.302108972799033, 2.310718991793692, 2.2999990032985806, 2.289349038619548], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.3001389927230775, "peak_bytes": 2415935488, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 4.500485956668854, "p50": 4.510977014433593, "p90": 4.513906955253333, "mean": 4.509088769555092, "iqr": 0.010930001735687256, "raw_times": [4.500485956668854, 4.510977014433593, 4.502976953517646, 4.513906955253333, 4.5170969679020345], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.5062569552101195, "peak_bytes": 4831870976, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D1024", "batch": 1, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.03865100006805733, "p50": 0.03903099991475756, "p90": 0.04018100003122527, "mean": 0.03959079995183856, "iqr": 0.001300000121773337, "raw_times": [0.03888099990945193, 0.03903099991475756, 0.04018100003122527, 0.04120999983570073, 0.03865100006805733], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05060100011178292, "peak_bytes": 2363392, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D2048", "batch": 1, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04517999991549004, "p50": 0.04712100007964182, "p90": 0.04805000003216264, "mean": 0.04695459997492435, "iqr": 0.001779000058377278, "raw_times": [0.04517999991549004, 0.04805000003216264, 0.04712100007964182, 0.046270999973785365, 0.04815099987354188], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05504099999598111, "peak_bytes": 4726784, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1324882507324219e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D4096", "batch": 1, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04581999996844388, "p50": 0.04766099982589367, "p90": 0.04786099998455029, "mean": 0.047156599976005964, "iqr": 0.0017899999420478707, "raw_times": [0.04766099982589367, 0.04786099998455029, 0.04837000005863956, 0.04607100004250242, 0.04581999996844388], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05361099988476781, "peak_bytes": 9453568, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00146484375, "mse": 1.049041748046875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D8192", "batch": 1, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.045190999799160636, "p50": 0.04684100008489622, "p90": 0.04752099994220771, "mean": 0.046596999982284615, "iqr": 0.00227999998969608, "raw_times": [0.04524099995251163, 0.04752099994220771, 0.04819100013264688, 0.04684100008489622, 0.045190999799160636], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052801000038016355, "peak_bytes": 18907136, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0016326904296875, "mse": 1.1801719665527344e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D1024", "batch": 1, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04395000019030704, "p50": 0.045061000037094345, "p90": 0.046920999920985196, "mean": 0.04563460001918429, "iqr": 0.0018609998733154498, "raw_times": [0.04718099989986513, 0.046920999920985196, 0.045060000047669746, 0.045061000037094345, 0.04395000019030704], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05090100012239418, "peak_bytes": 9441280, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D2048", "batch": 1, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04656000010072603, "p50": 0.046920999920985196, "p90": 0.04878100003224972, "mean": 0.04884479999418545, "iqr": 0.0020300001324358163, "raw_times": [0.04656000010072603, 0.046750999899813905, 0.04878100003224972, 0.0552110000171524, 0.046920999920985196], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0497399998948822, "peak_bytes": 18882560, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D4096", "batch": 1, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04567099995256285, "p50": 0.04622100004780805, "p90": 0.04798100007974426, "mean": 0.047496800016233465, "iqr": 0.0018200000795332016, "raw_times": [0.04567099995256285, 0.0514500000008411, 0.04616100000021106, 0.04798100007974426, 0.04622100004780805], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04885000021204178, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D8192", "batch": 1, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04487000001063279, "p50": 0.045961000068928115, "p90": 0.046200000042517786, "mean": 0.04860060003011313, "iqr": 0.000509000074089272, "raw_times": [0.06028100006005843, 0.04487000001063279, 0.045690999968428514, 0.045961000068928115, 0.046200000042517786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05061100000602892, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D1024", "batch": 1, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.043511000058060745, "p50": 0.046270999973785365, "p90": 0.04790999992110301, "mean": 0.047574600012012525, "iqr": 0.002919999815276242, "raw_times": [0.044990000105826766, 0.04790999992110301, 0.043511000058060745, 0.05519100000128674, 0.046270999973785365], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.048970999841913, "peak_bytes": 21008384, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D2048", "batch": 1, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.043170000026293565, "p50": 0.04767099994751334, "p90": 0.0476899999739544, "mean": 0.04691639996963204, "iqr": 0.0009390000741404947, "raw_times": [0.043170000026293565, 0.04930000000058499, 0.04767099994751334, 0.046750999899813905, 0.0476899999739544], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05084099984742352, "peak_bytes": 37756928, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D4096", "batch": 1, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044720999994751764, "p50": 0.045860000000175205, "p90": 0.046411000084845, "mean": 0.04585680003401649, "iqr": 0.0012000000424450263, "raw_times": [0.044720999994751764, 0.04708100004791049, 0.046411000084845, 0.045860000000175205, 0.045211000042399974], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05302099998516496, "peak_bytes": 75513856, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0016021728515625, "mse": 1.1682510375976562e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D8192", "batch": 1, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04476999993130448, "p50": 0.04614999988916679, "p90": 0.04633100002138235, "mean": 0.04639259996110923, "iqr": 0.00019000003703695256, "raw_times": [0.04476999993130448, 0.04614999988916679, 0.04857099997934711, 0.0461409999843454, 0.04633100002138235], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.047730999995110324, "peak_bytes": 151027712, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D1024", "batch": 1, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04606099992088275, "p50": 0.046679999968546326, "p90": 0.04687099999500788, "mean": 0.0466285999664251, "iqr": 0.0006509999366244301, "raw_times": [0.04606099992088275, 0.047310999889305094, 0.04687099999500788, 0.046679999968546326, 0.04622000005838345], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.050389999842082034, "peak_bytes": 41979904, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D2048", "batch": 1, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04560100001071987, "p50": 0.04617999979927845, "p90": 0.04656999999497202, "mean": 0.0462445999346528, "iqr": 0.0007090000053722179, "raw_times": [0.04617999979927845, 0.045860999989599804, 0.04560100001071987, 0.04701099987869384, 0.04656999999497202], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.049061000026995316, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D4096", "batch": 1, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04474100001061743, "p50": 0.04615000011654047, "p90": 0.04696099995271652, "mean": 0.046176800060493406, "iqr": 0.0009599998520570807, "raw_times": [0.047031000121933175, 0.04474100001061743, 0.04600100010065944, 0.04615000011654047, 0.04696099995271652], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.051490000032572425, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D8192", "batch": 1, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051341000016691396, "p50": 0.05155100006959401, "p90": 0.05226099983701715, "mean": 0.051880799992432, "iqr": 0.0007709998044447275, "raw_times": [0.051341000016691396, 0.051490000032572425, 0.05226099983701715, 0.05155100006959401, 0.05276100000628503], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053531000048678834, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D1024", "batch": 4, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044059999936507666, "p50": 0.04549100003714557, "p90": 0.045540999963122886, "mean": 0.04540859999906388, "iqr": 0.0004099999841855606, "raw_times": [0.04549100003714557, 0.044059999936507666, 0.04682000007960596, 0.045130999978937325, 0.045540999963122886], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.048860999868338695, "peak_bytes": 69242880, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D2048", "batch": 4, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04604099990501709, "p50": 0.04642099997909099, "p90": 0.04698099996858218, "mean": 0.05290099998092046, "iqr": 0.0009299999419454252, "raw_times": [0.07901100002527528, 0.04698099996858218, 0.04604099990501709, 0.04605100002663676, 0.04642099997909099], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.048481000021638465, "peak_bytes": 18882560, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D4096", "batch": 4, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04422100005285756, "p50": 0.045961000068928115, "p90": 0.04607100004250242, "mean": 0.04557280003609776, "iqr": 0.0010700000530050602, "raw_times": [0.04500099998949736, 0.045961000068928115, 0.04422100005285756, 0.046610000026703347, 0.04607100004250242], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05000100009056041, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D8192", "batch": 4, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044550999973580474, "p50": 0.04615100010596507, "p90": 0.04661999992094934, "mean": 0.04619880000973353, "iqr": 0.0006089999260439072, "raw_times": [0.04661999992094934, 0.047661000053267344, 0.04615100010596507, 0.04601099999490543, 0.044550999973580474], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05021999982091074, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D1024", "batch": 4, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04479100016396842, "p50": 0.04570999999486958, "p90": 0.04578100015351083, "mean": 0.04546060008578934, "iqr": 0.0006410000423784368, "raw_times": [0.045881000005465467, 0.04578100015351083, 0.045140000111132395, 0.04479100016396842, 0.04570999999486958], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05074099999546888, "peak_bytes": 37752832, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D2048", "batch": 4, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04388000002109038, "p50": 0.046260999852165696, "p90": 0.047070999926290824, "mean": 0.046070799999142764, "iqr": 0.0010899998414970469, "raw_times": [0.04716100011137314, 0.04598100008479378, 0.04388000002109038, 0.046260999852165696, 0.047070999926290824], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05007100003240339, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D4096", "batch": 4, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04435100004229753, "p50": 0.045130999978937325, "p90": 0.04698099996858218, "mean": 0.04562479998639901, "iqr": 0.0023600000531587284, "raw_times": [0.044620999915423454, 0.04698099996858218, 0.04704000002675457, 0.04435100004229753, 0.045130999978937325], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04849099991588446, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D8192", "batch": 4, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05119000002196117, "p50": 0.05123000005369249, "p90": 0.05150099991624302, "mean": 0.051574400004028575, "iqr": 0.00027999999474559445, "raw_times": [0.05122099992149742, 0.05150099991624302, 0.05123000005369249, 0.052730000106748776, 0.05119000002196117], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05633099999613478, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D1024", "batch": 4, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04580100016937649, "p50": 0.04708999995273189, "p90": 0.04770099985762499, "mean": 0.05188039999666216, "iqr": 0.00096099984148168, "raw_times": [0.07206999998743413, 0.04674000001614331, 0.04580100016937649, 0.04708999995273189, 0.04770099985762499], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04944000011164462, "peak_bytes": 83922944, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D2048", "batch": 4, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04320099992582982, "p50": 0.04512100008469133, "p90": 0.04604099990501709, "mean": 0.04527500000222062, "iqr": 0.001329999804511317, "raw_times": [0.04320099992582982, 0.04604099990501709, 0.04471100010050577, 0.0473009999950591, 0.04512100008469133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.051181000117139774, "peak_bytes": 151003136, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D4096", "batch": 4, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04984999986845651, "p50": 0.050290999979552, "p90": 0.050490999910834944, "mean": 0.050288599959458224, "iqr": 0.0005399999736255268, "raw_times": [0.04995099993720942, 0.050490999910834944, 0.050290999979552, 0.050860000101238256, 0.04984999986845651], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052241000048525166, "peak_bytes": 302006272, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D8192", "batch": 4, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2064129998871067, "p50": 0.2123330000358692, "p90": 0.218262999851504, "mean": 0.2148927999769512, "iqr": 0.010130999726243317, "raw_times": [0.20813200012526067, 0.218262999851504, 0.2123330000358692, 0.2064129998871067, 0.22932299998501549], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21481299995684822, "peak_bytes": 604012544, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D1024", "batch": 4, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04353000008450181, "p50": 0.04543000000012398, "p90": 0.04657099998439662, "mean": 0.04557060001388891, "iqr": 0.001390000079481979, "raw_times": [0.04543000000012398, 0.04518099990491464, 0.04714100009550748, 0.04657099998439662, 0.04353000008450181], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0490809998154873, "peak_bytes": 167809024, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D2048", "batch": 4, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.054420999958892935, "p50": 0.05506100001184677, "p90": 0.055460999874412664, "mean": 0.055042999929355574, "iqr": 0.0008699998943484388, "raw_times": [0.054420999958892935, 0.055460999874412664, 0.05568099982156127, 0.054590999980064225, 0.05506100001184677], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05802099985885434, "peak_bytes": 301998080, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D4096", "batch": 4, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.20996300008846447, "p50": 0.2102230000673444, "p90": 0.21053299997220165, "mean": 0.21050080003988114, "iqr": 0.0004209998678561533, "raw_times": [0.20996300008846447, 0.21053299997220165, 0.2102230000673444, 0.2101120001043455, 0.2116729999670497], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21157300011509506, "peak_bytes": 603996160, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D8192", "batch": 4, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4341660001045966, "p50": 0.4372359999251785, "p90": 0.4383160000998032, "mean": 0.437980000015159, "iqr": 0.004120000085094944, "raw_times": [0.4383160000998032, 0.4372359999251785, 0.4341660001045966, 0.43419600001470826, 0.44598599993150856], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.44448700009525055, "peak_bytes": 1207992320, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D1024", "batch": 16, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04514099987318332, "p50": 0.0465299999632407, "p90": 0.04655099996853096, "mean": 0.04629059999388119, "iqr": 0.0011309998626529705, "raw_times": [0.04514099987318332, 0.04655099996853096, 0.04781100005857297, 0.04542000010587799, 0.0465299999632407], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04811100006918423, "peak_bytes": 276860928, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D2048", "batch": 16, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.043820999962917995, "p50": 0.045551000084742554, "p90": 0.04633000003195775, "mean": 0.04580079998959263, "iqr": 0.0007890000688348664, "raw_times": [0.043820999962917995, 0.04776099990522198, 0.045551000084742554, 0.045540999963122886, 0.04633000003195775], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05054000007476134, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D4096", "batch": 16, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04511099996307166, "p50": 0.04610100017998775, "p90": 0.04624100006367371, "mean": 0.04598500008796691, "iqr": 0.0004099999841855606, "raw_times": [0.04583100007948815, 0.04664100015361328, 0.04511099996307166, 0.04610100017998775, 0.04624100006367371], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04932000001645065, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D8192", "batch": 16, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05165100014892232, "p50": 0.052130999847577186, "p90": 0.05317099999047059, "mean": 0.05250480003269331, "iqr": 0.0012309999419812812, "raw_times": [0.052130999847577186, 0.05165100014892232, 0.053631000128007145, 0.05317099999047059, 0.05194000004848931], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055000999964249786, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D1024", "batch": 16, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.045381000063571264, "p50": 0.045759999920846894, "p90": 0.04781100005857297, "mean": 0.04770240002471837, "iqr": 0.00238100005844899, "raw_times": [0.045759999920846894, 0.04781100005857297, 0.045381000063571264, 0.04543000000012398, 0.05413000008047675], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04919000002701068, "peak_bytes": 150999040, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D2048", "batch": 16, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05421099990599032, "p50": 0.054861000080563826, "p90": 0.05564100001720362, "mean": 0.05508300000656163, "iqr": 0.0010100000054080738, "raw_times": [0.056071000017254846, 0.054861000080563826, 0.05564100001720362, 0.05421099990599032, 0.05463100001179555], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05805000000691507, "peak_bytes": 301998080, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D4096", "batch": 16, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.20916299990858533, "p50": 0.21016300001974741, "p90": 0.21141399997759436, "mean": 0.21107719999235997, "iqr": 0.0015210000583465444, "raw_times": [0.21141399997759436, 0.2147530001366249, 0.2098929999192478, 0.21016300001974741, 0.20916299990858533], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21191299993006396, "peak_bytes": 603996160, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D8192", "batch": 16, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.43155599996680394, "p50": 0.43475600000419945, "p90": 0.4373360000045068, "mean": 0.43558200000006764, "iqr": 0.003800000058618025, "raw_times": [0.43475600000419945, 0.44072600007893925, 0.4373360000045068, 0.43353599994588876, 0.43155599996680394], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.44892699997944874, "peak_bytes": 1207992320, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D1024", "batch": 16, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0483500000427739, "p50": 0.049099999841928366, "p90": 0.04950099992129253, "mean": 0.050544599935165024, "iqr": 0.0011199999789823778, "raw_times": [0.048380999942310154, 0.04950099992129253, 0.05739099992752017, 0.049099999841928366, 0.0483500000427739], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05153099982635467, "peak_bytes": 335581184, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D2048", "batch": 16, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2181429999836837, "p50": 0.2215729998624738, "p90": 0.2217329999893991, "mean": 0.22086119997766218, "iqr": 0.003440000000409782, "raw_times": [0.2181429999836837, 0.2217329999893991, 0.21829299998898932, 0.2215729998624738, 0.22456400006376498], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22583300005862839, "peak_bytes": 603987968, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D4096", "batch": 16, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.43596600016826415, "p50": 0.4398270000365301, "p90": 0.4409260000102222, "mean": 0.4390922000766295, "iqr": 0.003549999973984086, "raw_times": [0.4398270000365301, 0.4409260000102222, 0.4413660001318931, 0.43596600016826415, 0.4373760000362381], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.44040700004188693, "peak_bytes": 1207975936, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D8192", "batch": 16, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8329219999723136, "p50": 0.8419220000632777, "p90": 0.8434520000264456, "mean": 0.84072780000497, "iqr": 0.002130000211764127, "raw_times": [0.8329219999723136, 0.8419220000632777, 0.8440210001481319, 0.8434520000264456, 0.8413219998146815], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8442119999472197, "peak_bytes": 2415951872, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00151824951171875, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D1024", "batch": 16, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21230300012575753, "p50": 0.2135429999725602, "p90": 0.2142630000889767, "mean": 0.21426700000120036, "iqr": 0.0008800002433417831, "raw_times": [0.21230300012575753, 0.2133829998456349, 0.2135429999725602, 0.2142630000889767, 0.21784299997307244], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22175300000526477, "peak_bytes": 671125504, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D2048", "batch": 16, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4536460000963416, "p50": 0.45670700001210207, "p90": 0.4569770001126017, "mean": 0.45669080004699936, "iqr": 0.00113999999484804, "raw_times": [0.4536460000963416, 0.4569770001126017, 0.45583700011775363, 0.45670700001210207, 0.4602869998961978], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4546860000118613, "peak_bytes": 1207967744, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8352710001418018, "p50": 0.8370320001631626, "p90": 0.8388319999994565, "mean": 0.8375798000997747, "iqr": 0.0019899998733308166, "raw_times": [0.8352710001418018, 0.8368420001261256, 0.8399220000683272, 0.8370320001631626, 0.8388319999994565], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.849921000053655, "peak_bytes": 2415935488, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6451530000267667, "p50": 1.6546740000649152, "p90": 1.6553830000702874, "mean": 1.6516054000476288, "iqr": 0.008870000101524056, "raw_times": [1.6553830000702874, 1.6465129999687633, 1.6563040001074114, 1.6546740000649152, 1.6451530000267667], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.655194000022675, "peak_bytes": 4831870976, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null} diff --git a/layer_norm/impls/cells/benchmark.py b/layer_norm/impls/cells/benchmark.py index 6a00a9f99d8d044ab5f9dc0f5019344cef0612b9..d871d1b25fedf8b294c567e9ac582decb62f3cde 100644 --- a/layer_norm/impls/cells/benchmark.py +++ b/layer_norm/impls/cells/benchmark.py @@ -3,6 +3,7 @@ # dependencies = [ # "numpy", # "torch==2.8.0", +# "kernels", # "kernels-benchmark-tools", # ] # @@ -12,15 +13,37 @@ import torch import sys from kernels_benchmark_tools import KernelTypeEnum, run_benchmark +from kernels import get_kernel +# Load the layer norm kernel +layer_norm_kernel = get_kernel("kernels-community/layer-norm") -def torch_layer_norm(x, weight, bias, eps: float = 1e-5): - return torch.nn.functional.layer_norm(x, (x.shape[-1],), weight, bias, eps) + +def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5): + B, S, D = x.shape + # The kernel expects [N, D] input; support beta (bias) if provided. + out = layer_norm_kernel.dropout_add_ln_fwd( + input=x.view(-1, D), + gamma=weight, + beta=bias, + rowscale=None, + colscale=None, + x0_subset=None, + z_subset=None, + dropout_p=0.0, + epsilon=eps, + rowscale_const=1.0, + z_numrows=S, + gen=None, + residual_in_fp32=False, + is_rms_norm=False, + )[0].view(B, S, D) + return out run_benchmark( kernel_type=KernelTypeEnum.LAYER_NORM, - impl_name="torch_layer_norm", - impl_tags={"family": "torch", "op": "layer_norm"}, - impl_func=torch_layer_norm, + impl_name="hf_kernels_layer_norm", + impl_tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, + impl_func=hf_kernels_layer_norm, ) \ No newline at end of file diff --git a/layer_norm/impls/hf_kernels_layer_norm.html b/layer_norm/impls/hf_kernels_layer_norm.html index 7c6779ec37fa43900d621bf6ad69b69b9c4ea785..41eded723a429c13af73316bd24eb2cd47fde3c8 100644 --- a/layer_norm/impls/hf_kernels_layer_norm.html +++ b/layer_norm/impls/hf_kernels_layer_norm.html @@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
@@ -3873,7 +3873,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: benchmark | 47.20s +Cell: benchmark | 6.33s | Raw @@ -3943,19 +3943,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S128_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 130.368us 1245.87% 130.368us 130.368us 1 - hf_kernels_layer_norm 10.50% 197.573us 99.64% 1.875ms 1.875ms 0.000us 0.00% 14.048us 14.048us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 3.47% 65.272us 87.77% 1.652ms 550.605us 10.464us 100.00% 14.048us 4.683us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 10.464us 100.00% 10.464us 3.488us 3 - Activity Buffer Request 79.11% 1.489ms 79.11% 1.489ms 1.489ms 3.584us 34.25% 3.584us 3.584us 1 - aten::view 1.38% 25.881us 1.38% 25.881us 4.314us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 2.61% 49.141us 2.61% 49.141us 5.460us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.46% 8.610us 0.46% 8.610us 2.870us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 2.12% 39.872us 2.12% 39.872us 13.291us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.36% 6.770us 0.36% 6.770us 6.770us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 126.624us 1327.85% 126.624us 126.624us 1 + hf_kernels_layer_norm 10.50% 192.054us 99.63% 1.822ms 1.822ms 0.000us 0.00% 12.800us 12.800us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 3.73% 68.149us 87.79% 1.605ms 535.007us 9.536us 100.00% 12.800us 4.267us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.536us 100.00% 9.536us 3.179us 3 + Activity Buffer Request 78.93% 1.443ms 78.93% 1.443ms 1.443ms 3.264us 34.23% 3.264us 3.264us 1 + aten::view 1.34% 24.540us 1.34% 24.540us 4.090us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 2.50% 45.632us 2.50% 45.632us 5.070us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.52% 9.500us 0.52% 9.500us 3.167us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 2.11% 38.660us 2.11% 38.660us 12.887us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.37% 6.690us 0.37% 6.690us 6.690us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.882ms -Self CUDA time total: 10.464us +Self CPU time total: 1.828ms +Self CUDA time total: 9.536us @@ -3965,19 +3965,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S128_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 117.981us 863.51% 117.981us 117.981us 1 - hf_kernels_layer_norm 7.44% 129.853us 99.69% 1.741ms 1.741ms 0.000us 0.00% 18.271us 18.271us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 2.62% 45.831us 91.51% 1.598ms 532.638us 13.663us 100.00% 18.271us 6.090us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 13.663us 100.00% 13.663us 4.554us 3 - Activity Buffer Request 85.13% 1.487ms 85.13% 1.487ms 1.487ms 4.608us 33.73% 4.608us 4.608us 1 - aten::view 0.75% 13.060us 0.75% 13.060us 2.177us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.75% 30.520us 1.75% 30.520us 3.391us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.27% 4.661us 0.27% 4.661us 1.554us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.74% 30.321us 1.74% 30.321us 10.107us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.31% 5.410us 0.31% 5.410us 5.410us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 118.975us 960.72% 118.975us 118.975us 1 + hf_kernels_layer_norm 8.90% 155.923us 99.67% 1.747ms 1.747ms 0.000us 0.00% 16.576us 16.576us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.71% 47.470us 90.07% 1.579ms 526.204us 12.384us 100.00% 16.576us 5.525us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 12.384us 100.00% 12.384us 4.128us 3 + Activity Buffer Request 83.60% 1.465ms 83.60% 1.465ms 1.465ms 4.192us 33.85% 4.192us 4.192us 1 + aten::view 0.71% 12.400us 0.71% 12.400us 2.067us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.73% 30.340us 1.73% 30.340us 3.371us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.28% 4.970us 0.28% 4.970us 1.657us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.74% 30.551us 1.74% 30.551us 10.184us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.33% 5.780us 0.33% 5.780us 5.780us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.746ms -Self CUDA time total: 13.663us +Self CPU time total: 1.753ms +Self CUDA time total: 12.384us @@ -3987,19 +3987,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S128_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 116.509us 943.24% 116.509us 116.509us 1 - hf_kernels_layer_norm 7.98% 138.752us 99.72% 1.735ms 1.735ms 0.000us 0.00% 16.480us 16.480us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 2.45% 42.600us 91.01% 1.583ms 527.711us 12.352us 100.00% 16.480us 5.493us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 12.352us 100.00% 12.352us 4.117us 3 - Activity Buffer Request 85.01% 1.479ms 85.01% 1.479ms 1.479ms 4.128us 33.42% 4.128us 4.128us 1 - aten::view 0.74% 12.801us 0.74% 12.801us 2.134us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.67% 29.111us 1.67% 29.111us 3.235us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.27% 4.660us 0.27% 4.660us 1.553us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.61% 28.011us 1.61% 28.011us 9.337us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.28% 4.840us 0.28% 4.840us 4.840us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 109.887us 1003.99% 109.887us 109.887us 1 + hf_kernels_layer_norm 7.66% 143.860us 99.71% 1.872ms 1.872ms 0.000us 0.00% 14.626us 14.626us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.49% 46.702us 91.41% 1.716ms 571.882us 10.945us 100.00% 14.626us 4.875us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 10.945us 100.00% 10.945us 3.648us 3 + Activity Buffer Request 85.70% 1.609ms 85.70% 1.609ms 1.609ms 3.681us 33.63% 3.681us 3.681us 1 + aten::view 0.64% 12.051us 0.64% 12.051us 2.008us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.56% 29.239us 1.56% 29.239us 3.249us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.26% 4.870us 0.26% 4.870us 1.623us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.40% 26.311us 1.40% 26.311us 8.770us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.29% 5.350us 0.29% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.740ms -Self CUDA time total: 12.352us +Self CPU time total: 1.877ms +Self CUDA time total: 10.945us @@ -4009,19 +4009,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S128_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 125.982us 578.96% 125.982us 125.982us 1 - hf_kernels_layer_norm 6.68% 137.125us 99.77% 2.048ms 2.048ms 0.000us 0.00% 29.120us 29.120us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 2.07% 42.461us 92.46% 1.898ms 632.783us 21.760us 100.00% 29.120us 9.707us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 21.760us 100.00% 21.760us 7.253us 3 - Activity Buffer Request 73.49% 1.509ms 73.49% 1.509ms 1.509ms 7.360us 33.82% 7.360us 7.360us 1 - aten::view 0.63% 13.010us 0.63% 13.010us 2.168us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.55% 31.790us 1.55% 31.790us 3.532us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.23% 4.660us 0.23% 4.660us 1.553us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 15.12% 310.466us 15.12% 310.466us 103.489us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.23% 4.720us 0.23% 4.720us 4.720us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 120.287us 916.82% 120.287us 120.287us 1 + hf_kernels_layer_norm 7.38% 148.710us 99.70% 2.008ms 2.008ms 0.000us 0.00% 17.504us 17.504us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.28% 45.984us 91.73% 1.848ms 615.912us 13.120us 100.00% 17.504us 5.835us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 13.120us 100.00% 13.120us 4.373us 3 + Activity Buffer Request 71.87% 1.448ms 71.87% 1.448ms 1.448ms 4.384us 33.41% 4.384us 4.384us 1 + aten::view 0.60% 12.011us 0.60% 12.011us 2.002us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.48% 29.740us 1.48% 29.740us 3.304us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.26% 5.319us 0.26% 5.319us 1.773us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 15.83% 318.904us 15.83% 318.904us 106.301us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.30% 5.970us 0.30% 5.970us 5.970us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.053ms -Self CUDA time total: 21.760us +Self CPU time total: 2.014ms +Self CUDA time total: 13.120us @@ -4031,19 +4031,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S512_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 121.087us 1103.20% 121.087us 121.087us 1 - hf_kernels_layer_norm 42.59% 1.314ms 99.83% 3.079ms 3.079ms 0.000us 0.00% 14.528us 14.528us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 1.41% 43.391us 56.84% 1.753ms 584.439us 10.976us 100.00% 14.528us 4.843us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 10.976us 100.00% 10.976us 3.659us 3 - Activity Buffer Request 48.69% 1.502ms 48.69% 1.502ms 1.502ms 3.552us 32.36% 3.552us 3.552us 1 - aten::view 0.40% 12.250us 0.40% 12.250us 2.042us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.96% 29.520us 0.96% 29.520us 3.280us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.17% 5.350us 0.17% 5.350us 1.783us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 5.61% 173.174us 5.61% 173.174us 57.725us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.17% 5.330us 0.17% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 114.559us 1189.48% 114.559us 114.559us 1 + hf_kernels_layer_norm 7.21% 135.832us 99.75% 1.879ms 1.879ms 0.000us 0.00% 12.767us 12.767us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.53% 47.731us 91.89% 1.731ms 576.915us 9.631us 100.00% 12.767us 4.256us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.631us 100.00% 9.631us 3.210us 3 + Activity Buffer Request 78.55% 1.480ms 78.55% 1.480ms 1.480ms 3.136us 32.56% 3.136us 3.136us 1 + aten::view 0.65% 12.210us 0.65% 12.210us 2.035us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.55% 29.201us 1.55% 29.201us 3.245us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.26% 4.830us 0.26% 4.830us 1.610us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 9.00% 169.482us 9.00% 169.482us 56.494us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.25% 4.770us 0.25% 4.770us 4.770us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.084ms -Self CUDA time total: 10.976us +Self CPU time total: 1.884ms +Self CUDA time total: 9.631us @@ -4053,19 +4053,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S512_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 118.334us 463.40% 118.334us 118.334us 1 - hf_kernels_layer_norm 20.93% 106.845us 98.96% 505.171us 505.171us 0.000us 0.00% 34.112us 34.112us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 8.38% 42.772us 75.59% 385.897us 128.632us 25.536us 100.00% 34.112us 11.371us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 25.536us 100.00% 25.536us 8.512us 3 - Activity Buffer Request 27.16% 138.642us 27.16% 138.642us 138.642us 8.576us 33.58% 8.576us 8.576us 1 - aten::view 2.43% 12.429us 2.43% 12.429us 2.072us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 6.18% 31.540us 6.18% 31.540us 3.504us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.06% 5.420us 1.06% 5.420us 1.807us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 32.82% 167.523us 32.82% 167.523us 55.841us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.04% 5.330us 1.04% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 117.151us 841.66% 117.151us 117.151us 1 + hf_kernels_layer_norm 7.38% 134.703us 99.74% 1.819ms 1.819ms 0.000us 0.00% 18.495us 18.495us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.52% 45.930us 91.68% 1.673ms 557.511us 13.919us 100.00% 18.495us 6.165us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 13.919us 100.00% 13.919us 4.640us 3 + Activity Buffer Request 78.70% 1.436ms 78.70% 1.436ms 1.436ms 4.576us 32.88% 4.576us 4.576us 1 + aten::view 0.67% 12.200us 0.67% 12.200us 2.033us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.63% 29.679us 1.63% 29.679us 3.298us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.30% 5.450us 0.30% 5.450us 1.817us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 8.54% 155.763us 8.54% 155.763us 51.921us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.26% 4.800us 0.26% 4.800us 4.800us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 510.501us -Self CUDA time total: 25.536us +Self CPU time total: 1.824ms +Self CUDA time total: 13.919us @@ -4075,19 +4075,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S512_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 120.447us 409.57% 120.447us 120.447us 1 - hf_kernels_layer_norm 17.42% 106.524us 99.31% 607.323us 607.323us 0.000us 0.00% 39.296us 39.296us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 7.38% 45.140us 79.94% 488.879us 162.960us 29.408us 100.00% 39.296us 13.099us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 29.408us 100.00% 29.408us 9.803us 3 - Activity Buffer Request 39.10% 239.095us 39.10% 239.095us 239.095us 9.888us 33.62% 9.888us 9.888us 1 - aten::view 1.95% 11.920us 1.95% 11.920us 1.987us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 4.84% 29.591us 4.84% 29.591us 3.288us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.81% 4.930us 0.81% 4.930us 1.643us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 27.82% 170.123us 27.82% 170.123us 56.708us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.69% 4.200us 0.69% 4.200us 4.200us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 121.982us 816.32% 121.982us 121.982us 1 + hf_kernels_layer_norm 7.42% 137.921us 99.71% 1.853ms 1.853ms 0.000us 0.00% 19.934us 19.934us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.51% 46.641us 91.61% 1.702ms 567.498us 14.943us 100.00% 19.934us 6.645us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 14.943us 100.00% 14.943us 4.981us 3 + Activity Buffer Request 78.68% 1.462ms 78.68% 1.462ms 1.462ms 4.991us 33.40% 4.991us 4.991us 1 + aten::view 0.68% 12.581us 0.68% 12.581us 2.097us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.61% 30.011us 1.61% 30.011us 3.335us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.26% 4.880us 0.26% 4.880us 1.627us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 8.55% 158.912us 8.55% 158.912us 52.971us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.29% 5.320us 0.29% 5.320us 5.320us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 611.523us -Self CUDA time total: 29.408us +Self CPU time total: 1.858ms +Self CUDA time total: 14.943us @@ -4097,19 +4097,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S512_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 133.151us 162.79% 133.151us 133.151us 1 - hf_kernels_layer_norm 6.90% 130.311us 99.76% 1.885ms 1.885ms 0.000us 0.00% 131.167us 131.167us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 2.33% 44.060us 92.18% 1.742ms 580.686us 81.791us 100.00% 131.167us 43.722us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 81.791us 100.00% 81.791us 27.264us 3 - Activity Buffer Request 79.05% 1.494ms 79.05% 1.494ms 1.494ms 49.376us 60.37% 49.376us 49.376us 1 - aten::view 0.68% 12.842us 0.68% 12.842us 2.140us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.69% 31.890us 1.69% 31.890us 3.543us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.27% 5.171us 0.27% 5.171us 1.724us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 8.84% 167.034us 8.84% 167.034us 55.678us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.24% 4.571us 0.24% 4.571us 4.571us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 122.336us 491.39% 122.336us 122.336us 1 + hf_kernels_layer_norm 7.27% 134.311us 99.73% 1.842ms 1.842ms 0.000us 0.00% 33.152us 33.152us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.48% 45.720us 91.77% 1.695ms 564.845us 24.896us 100.00% 33.152us 11.051us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 24.896us 100.00% 24.896us 8.299us 3 + Activity Buffer Request 78.89% 1.457ms 78.89% 1.457ms 1.457ms 8.256us 33.16% 8.256us 8.256us 1 + aten::view 0.69% 12.770us 0.69% 12.770us 2.128us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.64% 30.291us 1.64% 30.291us 3.366us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.28% 5.131us 0.28% 5.131us 1.710us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 8.48% 156.672us 8.48% 156.672us 52.224us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.27% 4.950us 0.27% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.890ms -Self CUDA time total: 81.791us +Self CPU time total: 1.847ms +Self CUDA time total: 24.896us @@ -4119,19 +4119,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S1024_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 117.246us 654.27% 117.246us 117.246us 1 - hf_kernels_layer_norm 22.73% 119.272us 99.14% 520.171us 520.171us 0.000us 0.00% 23.808us 23.808us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 8.22% 43.142us 74.00% 388.268us 129.423us 17.920us 100.00% 23.808us 7.936us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 17.920us 100.00% 17.920us 5.973us 3 - Activity Buffer Request 27.73% 145.503us 27.73% 145.503us 145.503us 5.888us 32.86% 5.888us 5.888us 1 - aten::view 2.41% 12.631us 2.41% 12.631us 2.105us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 5.99% 31.410us 5.99% 31.410us 3.490us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.87% 4.560us 0.87% 4.560us 1.520us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 31.19% 163.653us 31.19% 163.653us 54.551us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.86% 4.531us 0.86% 4.531us 4.531us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 112.508us 1085.25% 112.508us 112.508us 1 + hf_kernels_layer_norm 20.69% 103.551us 99.03% 495.767us 495.767us 0.000us 0.00% 13.759us 13.759us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 9.55% 47.810us 76.09% 380.926us 126.975us 10.367us 100.00% 13.759us 4.586us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 10.367us 100.00% 10.367us 3.456us 3 + Activity Buffer Request 28.93% 144.803us 28.93% 144.803us 144.803us 3.392us 32.72% 3.392us 3.392us 1 + aten::view 2.26% 11.290us 2.26% 11.290us 1.882us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 5.78% 28.941us 5.78% 28.941us 3.216us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.18% 5.889us 1.18% 5.889us 1.963us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 30.66% 153.483us 30.66% 153.483us 51.161us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.97% 4.840us 0.97% 4.840us 4.840us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 524.702us -Self CUDA time total: 17.920us +Self CPU time total: 500.607us +Self CUDA time total: 10.367us @@ -4141,19 +4141,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S1024_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 136.733us 373.85% 136.733us 136.733us 1 - hf_kernels_layer_norm 7.33% 138.162us 99.74% 1.881ms 1.881ms 0.000us 0.00% 48.861us 48.861us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 2.44% 46.001us 91.74% 1.730ms 576.679us 36.574us 100.00% 48.861us 16.287us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 36.574us 100.00% 36.574us 12.191us 3 - Activity Buffer Request 78.81% 1.486ms 78.81% 1.486ms 1.486ms 12.287us 33.59% 12.287us 12.287us 1 - aten::view 0.68% 12.810us 0.68% 12.810us 2.135us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.62% 30.630us 1.62% 30.630us 3.403us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.26% 4.810us 0.26% 4.810us 1.603us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 8.61% 162.344us 8.61% 162.344us 54.115us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.26% 4.870us 0.26% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 114.622us 709.29% 114.622us 114.622us 1 + hf_kernels_layer_norm 17.15% 104.082us 99.15% 601.769us 601.769us 0.000us 0.00% 21.536us 21.536us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 7.36% 44.690us 80.00% 485.537us 161.846us 16.160us 100.00% 21.536us 7.179us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 16.160us 100.00% 16.160us 5.387us 3 + Activity Buffer Request 41.13% 249.624us 41.13% 249.624us 249.624us 5.376us 33.27% 5.376us 5.376us 1 + aten::view 2.00% 12.150us 2.00% 12.150us 2.025us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 4.85% 29.441us 4.85% 29.441us 3.271us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.88% 5.329us 0.88% 5.329us 1.776us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 25.78% 156.453us 25.78% 156.453us 52.151us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.85% 5.140us 0.85% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.886ms -Self CUDA time total: 36.574us +Self CPU time total: 606.909us +Self CUDA time total: 16.160us @@ -4163,19 +4163,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S1024_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 133.789us 167.24% 133.789us 133.789us 1 - hf_kernels_layer_norm 7.08% 135.354us 99.76% 1.906ms 1.906ms 0.000us 0.00% 130.077us 130.077us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 2.31% 44.121us 92.04% 1.758ms 586.112us 79.998us 100.00% 130.077us 43.359us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 79.998us 100.00% 79.998us 26.666us 3 - Activity Buffer Request 79.38% 1.516ms 79.38% 1.516ms 1.516ms 50.079us 62.60% 50.079us 50.079us 1 - aten::view 0.64% 12.280us 0.64% 12.280us 2.047us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.63% 31.230us 1.63% 31.230us 3.470us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.30% 5.759us 0.30% 5.759us 1.920us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 8.41% 160.764us 8.41% 160.764us 53.588us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.24% 4.530us 0.24% 4.530us 4.530us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 116.126us 544.07% 116.126us 116.126us 1 + hf_kernels_layer_norm 21.73% 103.750us 98.94% 472.437us 472.437us 0.000us 0.00% 28.448us 28.448us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 9.81% 46.840us 74.86% 357.435us 119.145us 21.344us 100.00% 28.448us 9.483us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 21.344us 100.00% 21.344us 7.115us 3 + Activity Buffer Request 25.46% 121.562us 25.46% 121.562us 121.562us 7.104us 33.28% 7.104us 7.104us 1 + aten::view 2.36% 11.252us 2.36% 11.252us 1.875us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 6.20% 29.622us 6.20% 29.622us 3.291us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.03% 4.929us 1.03% 4.929us 1.643us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 32.35% 154.482us 32.35% 154.482us 51.494us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.06% 5.060us 1.06% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.911ms -Self CUDA time total: 79.998us +Self CPU time total: 477.497us +Self CUDA time total: 21.344us @@ -4185,19 +4185,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S1024_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 24.43% 134.014us 98.52% 540.532us 540.532us 0.000us 0.00% 271.640us 271.640us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 7.86% 43.128us 71.93% 394.628us 131.543us 169.403us 100.00% 271.640us 90.547us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 170.779us 100.81% 170.779us 170.779us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 169.403us 100.00% 169.403us 56.468us 3 - Activity Buffer Request 27.58% 151.333us 27.58% 151.333us 151.333us 102.237us 60.35% 102.237us 102.237us 1 - aten::view 2.17% 11.890us 2.17% 11.890us 1.982us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 5.94% 32.563us 5.94% 32.563us 3.618us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.88% 4.820us 0.88% 4.820us 1.607us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 29.67% 162.784us 29.67% 162.784us 54.261us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.48% 8.120us 1.48% 8.120us 8.120us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 123.005us 198.35% 123.005us 123.005us 1 + hf_kernels_layer_norm 17.67% 104.362us 99.18% 585.739us 585.739us 0.000us 0.00% 97.950us 97.950us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 7.69% 45.431us 79.53% 469.697us 156.566us 62.015us 100.00% 97.950us 32.650us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 62.015us 100.00% 62.015us 20.672us 3 + Activity Buffer Request 38.94% 229.994us 38.94% 229.994us 229.994us 35.935us 57.95% 35.935us 35.935us 1 + aten::view 1.98% 11.680us 1.98% 11.680us 1.947us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 4.96% 29.301us 4.96% 29.301us 3.256us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.94% 5.530us 0.94% 5.530us 1.843us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 27.00% 159.441us 27.00% 159.441us 53.147us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.82% 4.870us 0.82% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 548.652us -Self CUDA time total: 169.403us +Self CPU time total: 590.609us +Self CUDA time total: 62.015us @@ -4207,19 +4207,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S2048_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 123.518us 475.36% 123.518us 123.518us 1 - hf_kernels_layer_norm 6.86% 128.144us 99.77% 1.864ms 1.864ms 0.000us 0.00% 34.752us 34.752us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 2.35% 44.000us 92.22% 1.723ms 574.492us 25.984us 100.00% 34.752us 11.584us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 25.984us 100.00% 25.984us 8.661us 3 - Activity Buffer Request 79.41% 1.484ms 79.41% 1.484ms 1.484ms 8.768us 33.74% 8.768us 8.768us 1 - aten::view 0.69% 12.810us 0.69% 12.810us 2.135us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.65% 30.922us 1.65% 30.922us 3.436us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.24% 4.540us 0.24% 4.540us 1.513us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 8.56% 160.003us 8.56% 160.003us 53.334us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.23% 4.370us 0.23% 4.370us 4.370us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 112.925us 880.03% 112.925us 112.925us 1 + hf_kernels_layer_norm 21.36% 101.251us 98.99% 469.286us 469.286us 0.000us 0.00% 17.152us 17.152us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 9.95% 47.161us 75.23% 356.625us 118.875us 12.832us 100.00% 17.152us 5.717us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 12.832us 100.00% 12.832us 4.277us 3 + Activity Buffer Request 24.52% 116.222us 24.52% 116.222us 116.222us 4.320us 33.67% 4.320us 4.320us 1 + aten::view 2.41% 11.410us 2.41% 11.410us 1.902us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 6.33% 30.000us 6.33% 30.000us 3.333us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.99% 4.690us 0.99% 4.690us 1.563us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 33.44% 158.552us 33.44% 158.552us 52.851us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.01% 4.791us 1.01% 4.791us 4.791us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.869ms -Self CUDA time total: 25.984us +Self CPU time total: 474.077us +Self CUDA time total: 12.832us @@ -4229,19 +4229,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S2048_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 22.41% 105.332us 99.06% 465.510us 465.510us 0.000us 0.00% 143.994us 143.994us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 9.32% 43.790us 74.15% 348.436us 116.145us 90.972us 100.00% 143.994us 47.998us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 126.972us 139.57% 126.972us 126.972us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 90.972us 100.00% 90.972us 30.324us 3 - Activity Buffer Request 22.99% 108.033us 22.99% 108.033us 108.033us 53.022us 58.28% 53.022us 53.022us 1 - aten::view 2.50% 11.742us 2.50% 11.742us 1.957us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 6.55% 30.800us 6.55% 30.800us 3.422us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.14% 5.380us 1.14% 5.380us 1.793us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 34.14% 160.433us 34.14% 160.433us 53.478us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.94% 4.420us 0.94% 4.420us 4.420us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 116.894us 456.05% 116.894us 116.894us 1 + hf_kernels_layer_norm 16.78% 104.390us 99.21% 617.040us 617.040us 0.000us 0.00% 34.336us 34.336us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 7.67% 47.682us 80.57% 501.128us 167.043us 25.632us 100.00% 34.336us 11.445us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 25.632us 100.00% 25.632us 8.544us 3 + Activity Buffer Request 42.51% 264.394us 42.51% 264.394us 264.394us 8.704us 33.96% 8.704us 8.704us 1 + aten::view 1.85% 11.522us 1.85% 11.522us 1.920us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 4.71% 29.300us 4.71% 29.300us 3.256us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.84% 5.220us 0.84% 5.220us 1.740us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 24.85% 154.532us 24.85% 154.532us 51.511us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.79% 4.910us 0.79% 4.910us 4.910us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 469.930us -Self CUDA time total: 90.972us +Self CPU time total: 621.950us +Self CUDA time total: 25.632us @@ -4251,19 +4251,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S2048_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 7.06% 132.903us 99.72% 1.877ms 1.877ms 0.000us 0.00% 251.833us 251.833us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 2.36% 44.503us 92.00% 1.732ms 577.246us 154.620us 100.00% 251.833us 83.944us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 155.868us 100.81% 155.868us 155.868us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 154.620us 100.00% 154.620us 51.540us 3 - Activity Buffer Request 79.11% 1.489ms 79.11% 1.489ms 1.489ms 97.213us 62.87% 97.213us 97.213us 1 - aten::view 0.66% 12.470us 0.66% 12.470us 2.078us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.68% 31.630us 1.68% 31.630us 3.514us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.25% 4.790us 0.25% 4.790us 1.597us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 8.59% 161.763us 8.59% 161.763us 53.921us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.28% 5.240us 0.28% 5.240us 5.240us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 123.901us 207.17% 123.901us 123.901us 1 + hf_kernels_layer_norm 17.03% 105.700us 99.25% 616.179us 616.179us 0.000us 0.00% 95.452us 95.452us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 7.57% 46.994us 80.35% 498.838us 166.279us 59.805us 100.00% 95.452us 31.817us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 59.805us 100.00% 59.805us 19.935us 3 + Activity Buffer Request 42.09% 261.283us 42.09% 261.283us 261.283us 35.647us 59.61% 35.647us 35.647us 1 + aten::view 1.88% 11.641us 1.88% 11.641us 1.940us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 4.84% 30.020us 4.84% 30.020us 3.336us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.92% 5.739us 0.92% 5.739us 1.913us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 24.93% 154.802us 24.93% 154.802us 51.601us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.75% 4.650us 0.75% 4.650us 4.650us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.882ms -Self CUDA time total: 154.620us +Self CPU time total: 620.829us +Self CUDA time total: 59.805us @@ -4273,19 +4273,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S2048_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 5.50% 137.653us 75.82% 1.896ms 1.896ms 0.000us 0.00% 1.022ms 1.022ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 1.78% 44.522us 69.80% 1.746ms 581.866us 773.939us 100.00% 1.022ms 340.666us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 775.315us 100.18% 775.315us 775.315us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 773.939us 100.00% 773.939us 257.980us 3 - Activity Buffer Request 60.12% 1.504ms 60.12% 1.504ms 1.504ms 248.060us 32.05% 248.060us 248.060us 1 - aten::view 0.52% 12.931us 0.52% 12.931us 2.155us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.22% 30.580us 1.22% 30.580us 3.398us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.18% 4.610us 0.18% 4.610us 1.537us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 6.49% 162.233us 6.49% 162.233us 54.078us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 24.18% 604.773us 24.18% 604.773us 604.773us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 20.93% 115.170us 99.06% 545.227us 545.227us 0.000us 0.00% 194.686us 194.686us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 8.82% 48.552us 75.83% 417.326us 139.109us 120.767us 100.00% 194.686us 64.895us 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 137.247us 113.65% 137.247us 137.247us 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 120.767us 100.00% 120.767us 40.256us 3 + Activity Buffer Request 31.56% 173.672us 31.56% 173.672us 173.672us 73.919us 61.21% 73.919us 73.919us 1 + aten::view 2.31% 12.731us 2.31% 12.731us 2.122us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 5.79% 31.840us 5.79% 31.840us 3.538us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.15% 6.350us 1.15% 6.350us 2.117us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 28.51% 156.912us 28.51% 156.912us 52.304us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.94% 5.151us 0.94% 5.151us 5.151us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.501ms -Self CUDA time total: 773.939us +Self CPU time total: 550.378us +Self CUDA time total: 120.767us @@ -4295,19 +4295,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S128_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 114.235us 1022.79% 114.235us 114.235us 1 - hf_kernels_layer_norm 20.56% 107.954us 99.19% 520.921us 520.921us 0.000us 0.00% 14.722us 14.722us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 8.06% 42.351us 76.34% 400.957us 133.652us 11.169us 100.00% 14.722us 4.907us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 11.169us 100.00% 11.169us 3.723us 3 - Activity Buffer Request 31.14% 163.523us 31.14% 163.523us 163.523us 3.553us 31.81% 3.553us 3.553us 1 - aten::view 2.29% 12.010us 2.29% 12.010us 2.002us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 6.02% 31.620us 6.02% 31.620us 3.513us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.87% 4.550us 0.87% 4.550us 1.517us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 30.26% 158.913us 30.26% 158.913us 52.971us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.81% 4.270us 0.81% 4.270us 4.270us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 120.958us 1277.01% 120.958us 120.958us 1 + hf_kernels_layer_norm 13.96% 126.333us 99.48% 900.293us 900.293us 0.000us 0.00% 12.480us 12.480us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 5.25% 47.490us 84.03% 760.450us 253.483us 9.472us 100.00% 12.480us 4.160us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.472us 100.00% 9.472us 3.157us 3 + Activity Buffer Request 56.99% 515.778us 56.99% 515.778us 515.778us 3.008us 31.76% 3.008us 3.008us 1 + aten::view 1.49% 13.510us 1.49% 13.510us 2.252us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 3.30% 29.900us 3.30% 29.900us 3.322us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.61% 5.520us 0.61% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 17.87% 161.762us 17.87% 161.762us 53.921us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.52% 4.731us 0.52% 4.731us 4.731us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 525.191us -Self CUDA time total: 11.169us +Self CPU time total: 905.024us +Self CUDA time total: 9.472us @@ -4317,19 +4317,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S128_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 123.677us 491.74% 123.677us 123.677us 1 - hf_kernels_layer_norm 6.82% 128.063us 99.76% 1.873ms 1.873ms 0.000us 0.00% 33.759us 33.759us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 2.38% 44.761us 92.27% 1.732ms 577.489us 25.151us 100.00% 33.759us 11.253us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 25.151us 100.00% 25.151us 8.384us 3 - Activity Buffer Request 79.39% 1.491ms 79.39% 1.491ms 1.491ms 8.608us 34.23% 8.608us 8.608us 1 - aten::view 0.67% 12.571us 0.67% 12.571us 2.095us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.69% 31.810us 1.69% 31.810us 3.534us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.24% 4.510us 0.24% 4.510us 1.503us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 8.56% 160.733us 8.56% 160.733us 53.578us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.24% 4.560us 0.24% 4.560us 4.560us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 119.647us 905.32% 119.647us 119.647us 1 + hf_kernels_layer_norm 7.02% 129.983us 99.72% 1.846ms 1.846ms 0.000us 0.00% 17.632us 17.632us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.48% 45.879us 92.05% 1.704ms 568.058us 13.216us 100.00% 17.632us 5.877us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 13.216us 100.00% 13.216us 4.405us 3 + Activity Buffer Request 79.30% 1.468ms 79.30% 1.468ms 1.468ms 4.416us 33.41% 4.416us 4.416us 1 + aten::view 0.65% 12.030us 0.65% 12.030us 2.005us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.60% 29.701us 1.60% 29.701us 3.300us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.33% 6.090us 0.33% 6.090us 2.030us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 8.34% 154.332us 8.34% 154.332us 51.444us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.28% 5.130us 0.28% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.878ms -Self CUDA time total: 25.151us +Self CPU time total: 1.851ms +Self CUDA time total: 13.216us @@ -4339,19 +4339,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S128_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 119.706us 417.98% 119.706us 119.706us 1 - hf_kernels_layer_norm 25.81% 125.022us 99.07% 479.820us 479.820us 0.000us 0.00% 38.142us 38.142us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 9.03% 43.713us 70.85% 343.148us 114.383us 28.639us 100.00% 38.142us 12.714us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 28.639us 100.00% 28.639us 9.546us 3 - Activity Buffer Request 21.68% 105.002us 21.68% 105.002us 105.002us 9.503us 33.18% 9.503us 9.503us 1 - aten::view 2.41% 11.650us 2.41% 11.650us 1.942us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 6.56% 31.751us 6.56% 31.751us 3.528us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.93% 4.499us 0.93% 4.499us 1.500us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 32.66% 158.183us 32.66% 158.183us 52.728us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.93% 4.510us 0.93% 4.510us 4.510us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 119.904us 814.57% 119.904us 119.904us 1 + hf_kernels_layer_norm 6.96% 128.481us 99.73% 1.842ms 1.842ms 0.000us 0.00% 19.648us 19.648us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.56% 47.250us 92.11% 1.701ms 566.981us 14.720us 100.00% 19.648us 6.549us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 14.720us 100.00% 14.720us 4.907us 3 + Activity Buffer Request 79.23% 1.463ms 79.23% 1.463ms 1.463ms 4.928us 33.48% 4.928us 4.928us 1 + aten::view 0.66% 12.121us 0.66% 12.121us 2.020us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.62% 29.881us 1.62% 29.881us 3.320us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.34% 6.300us 0.34% 6.300us 2.100us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 8.36% 154.452us 8.36% 154.452us 51.484us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.27% 5.031us 0.27% 5.031us 5.031us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 484.330us -Self CUDA time total: 28.639us +Self CPU time total: 1.847ms +Self CUDA time total: 14.720us @@ -4361,19 +4361,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S128_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 136.221us 162.42% 136.221us 136.221us 1 - hf_kernels_layer_norm 5.91% 110.784us 99.77% 1.870ms 1.870ms 0.000us 0.00% 135.358us 135.358us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 2.49% 46.760us 93.19% 1.747ms 582.332us 83.871us 100.00% 135.358us 45.119us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 83.871us 100.00% 83.871us 27.957us 3 - Activity Buffer Request 79.40% 1.488ms 79.40% 1.488ms 1.488ms 51.487us 61.39% 51.487us 51.487us 1 - aten::view 0.67% 12.640us 0.67% 12.640us 2.107us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.64% 30.810us 1.64% 30.810us 3.423us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.26% 4.850us 0.26% 4.850us 1.617us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 9.39% 176.124us 9.39% 176.124us 58.708us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.23% 4.240us 0.23% 4.240us 4.240us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 123.838us 511.90% 123.838us 123.838us 1 + hf_kernels_layer_norm 6.93% 126.950us 99.73% 1.827ms 1.827ms 0.000us 0.00% 32.224us 32.224us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.51% 46.080us 92.13% 1.688ms 562.698us 24.192us 100.00% 32.224us 10.741us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 24.192us 100.00% 24.192us 8.064us 3 + Activity Buffer Request 79.12% 1.450ms 79.12% 1.450ms 1.450ms 8.032us 33.20% 8.032us 8.032us 1 + aten::view 0.67% 12.241us 0.67% 12.241us 2.040us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.67% 30.641us 1.67% 30.641us 3.405us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.33% 5.980us 0.33% 5.980us 1.993us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 8.50% 155.772us 8.50% 155.772us 51.924us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.27% 4.990us 0.27% 4.990us 4.990us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.875ms -Self CUDA time total: 83.871us +Self CPU time total: 1.832ms +Self CUDA time total: 24.192us @@ -4383,19 +4383,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S512_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 127.996us 494.44% 127.996us 127.996us 1 - hf_kernels_layer_norm 7.05% 134.013us 99.76% 1.896ms 1.896ms 0.000us 0.00% 34.367us 34.367us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 2.28% 43.262us 92.05% 1.750ms 583.272us 25.887us 100.00% 34.367us 11.456us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 25.887us 100.00% 25.887us 8.629us 3 - Activity Buffer Request 79.44% 1.510ms 79.44% 1.510ms 1.510ms 8.480us 32.76% 8.480us 8.480us 1 - aten::view 0.66% 12.451us 0.66% 12.451us 2.075us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.65% 31.400us 1.65% 31.400us 3.489us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.25% 4.830us 0.25% 4.830us 1.610us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 8.44% 160.343us 8.44% 160.343us 53.448us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.24% 4.600us 0.24% 4.600us 4.600us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 114.752us 903.27% 114.752us 114.752us 1 + hf_kernels_layer_norm 6.98% 127.002us 99.74% 1.816ms 1.816ms 0.000us 0.00% 16.896us 16.896us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.46% 44.721us 92.11% 1.677ms 559.031us 12.704us 100.00% 16.896us 5.632us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 12.704us 100.00% 12.704us 4.235us 3 + Activity Buffer Request 79.42% 1.446ms 79.42% 1.446ms 1.446ms 4.192us 33.00% 4.192us 4.192us 1 + aten::view 0.65% 11.810us 0.65% 11.810us 1.968us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.61% 29.350us 1.61% 29.350us 3.261us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.30% 5.480us 0.30% 5.480us 1.827us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 8.33% 151.582us 8.33% 151.582us 50.527us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.26% 4.810us 0.26% 4.810us 4.810us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.901ms -Self CUDA time total: 25.887us +Self CPU time total: 1.821ms +Self CUDA time total: 12.704us @@ -4405,19 +4405,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S512_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 25.05% 130.783us 99.19% 517.901us 517.901us 0.000us 0.00% 143.738us 143.738us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 8.16% 42.610us 71.72% 374.457us 124.819us 90.940us 100.00% 143.738us 47.913us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 129.787us 142.72% 129.787us 129.787us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 90.940us 100.00% 90.940us 30.313us 3 - Activity Buffer Request 26.41% 137.873us 26.41% 137.873us 137.873us 52.798us 58.06% 52.798us 52.798us 1 - aten::view 2.42% 12.661us 2.42% 12.661us 2.110us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 6.15% 32.091us 6.15% 32.091us 3.566us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.86% 4.510us 0.86% 4.510us 1.503us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 30.14% 157.373us 30.14% 157.373us 52.458us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.81% 4.209us 0.81% 4.209us 4.209us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 114.171us 434.06% 114.171us 114.171us 1 + hf_kernels_layer_norm 21.27% 106.031us 98.93% 493.167us 493.167us 0.000us 0.00% 35.134us 35.134us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 8.94% 44.581us 75.39% 375.835us 125.278us 26.303us 100.00% 35.134us 11.711us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 26.303us 100.00% 26.303us 8.768us 3 + Activity Buffer Request 28.70% 143.052us 28.70% 143.052us 143.052us 8.831us 33.57% 8.831us 8.831us 1 + aten::view 2.27% 11.301us 2.27% 11.301us 1.883us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 5.86% 29.220us 5.86% 29.220us 3.247us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.95% 4.720us 0.95% 4.720us 1.573us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 30.95% 154.262us 30.95% 154.262us 51.421us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.07% 5.331us 1.07% 5.331us 5.331us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 522.110us -Self CUDA time total: 90.940us +Self CPU time total: 498.498us +Self CUDA time total: 26.303us @@ -4427,19 +4427,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S512_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 7.06% 133.042us 99.73% 1.880ms 1.880ms 0.000us 0.00% 249.725us 249.725us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 2.38% 44.822us 92.02% 1.735ms 578.362us 152.446us 100.00% 249.725us 83.242us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 153.726us 100.84% 153.726us 153.726us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 152.446us 100.00% 152.446us 50.815us 3 - Activity Buffer Request 78.99% 1.489ms 78.99% 1.489ms 1.489ms 97.279us 63.81% 97.279us 97.279us 1 - aten::view 0.65% 12.322us 0.65% 12.322us 2.054us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.73% 32.600us 1.73% 32.600us 3.622us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.25% 4.629us 0.25% 4.629us 1.543us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 8.68% 163.723us 8.68% 163.723us 54.574us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.27% 5.040us 0.27% 5.040us 5.040us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 126.302us 214.16% 126.302us 126.302us 1 + hf_kernels_layer_norm 6.77% 126.701us 99.74% 1.866ms 1.866ms 0.000us 0.00% 94.496us 94.496us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.55% 47.732us 92.27% 1.726ms 575.432us 58.976us 100.00% 94.496us 31.499us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 58.976us 100.00% 58.976us 19.659us 3 + Activity Buffer Request 79.36% 1.485ms 79.36% 1.485ms 1.485ms 35.520us 60.23% 35.520us 35.520us 1 + aten::view 0.70% 13.010us 0.70% 13.010us 2.168us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.62% 30.339us 1.62% 30.339us 3.371us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.26% 4.881us 0.26% 4.881us 1.627us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 8.48% 158.562us 8.48% 158.562us 52.854us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.26% 4.860us 0.26% 4.860us 4.860us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.885ms -Self CUDA time total: 152.446us +Self CPU time total: 1.871ms +Self CUDA time total: 58.976us @@ -4449,19 +4449,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S512_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 11.27% 123.983us 43.73% 481.131us 481.131us 0.000us 0.00% 1.032ms 1.032ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 3.84% 42.226us 31.34% 344.788us 114.929us 778.697us 100.00% 1.032ms 344.128us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 780.107us 100.18% 780.107us 780.107us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 778.697us 100.00% 778.697us 259.566us 3 - Activity Buffer Request 9.29% 102.192us 9.29% 102.192us 102.192us 253.688us 32.58% 253.688us 253.688us 1 - aten::view 1.12% 12.360us 1.12% 12.360us 2.060us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 2.69% 29.646us 2.69% 29.646us 3.294us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.46% 5.100us 0.46% 5.100us 1.700us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 15.05% 165.624us 15.05% 165.624us 55.208us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 56.27% 619.134us 56.27% 619.134us 619.134us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 7.35% 135.313us 99.73% 1.836ms 1.836ms 0.000us 0.00% 200.830us 200.830us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.50% 46.052us 91.69% 1.688ms 562.585us 126.431us 100.00% 200.830us 66.943us 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 142.015us 112.33% 142.015us 142.015us 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 126.431us 100.00% 126.431us 42.144us 3 + Activity Buffer Request 77.83% 1.433ms 77.83% 1.433ms 1.433ms 74.399us 58.85% 74.399us 74.399us 1 + aten::view 0.68% 12.599us 0.68% 12.599us 2.100us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.73% 31.929us 1.73% 31.929us 3.548us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.30% 5.440us 0.30% 5.440us 1.813us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 9.33% 171.692us 9.33% 171.692us 57.231us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.27% 4.980us 0.27% 4.980us 4.980us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.100ms -Self CUDA time total: 778.697us +Self CPU time total: 1.841ms +Self CUDA time total: 126.431us @@ -4471,19 +4471,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S1024_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 21.50% 105.382us 99.03% 485.481us 485.481us 0.000us 0.00% 129.403us 129.403us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 8.84% 43.324us 74.99% 367.638us 122.546us 78.013us 100.00% 129.403us 43.134us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 123.195us 157.92% 123.195us 123.195us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 78.013us 100.00% 78.013us 26.004us 3 - Activity Buffer Request 25.08% 122.952us 25.08% 122.952us 122.952us 51.390us 65.87% 51.390us 51.390us 1 - aten::view 2.54% 12.461us 2.54% 12.461us 2.077us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 6.14% 30.100us 6.14% 30.100us 3.344us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.92% 4.499us 0.92% 4.499us 1.500us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 34.02% 166.763us 34.02% 166.763us 55.588us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.97% 4.770us 0.97% 4.770us 4.770us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 114.877us 559.23% 114.877us 114.877us 1 + hf_kernels_layer_norm 18.77% 104.472us 99.13% 551.627us 551.627us 0.000us 0.00% 27.357us 27.357us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 8.09% 45.039us 78.27% 435.585us 145.195us 20.542us 100.00% 27.357us 9.119us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 20.542us 100.00% 20.542us 6.847us 3 + Activity Buffer Request 36.72% 204.352us 36.72% 204.352us 204.352us 6.815us 33.18% 6.815us 6.815us 1 + aten::view 2.08% 11.570us 2.08% 11.570us 1.928us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 5.24% 29.142us 5.24% 29.142us 3.238us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.93% 5.150us 0.93% 5.150us 1.717us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 27.30% 151.902us 27.30% 151.902us 50.634us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.87% 4.869us 0.87% 4.869us 4.869us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 490.251us -Self CUDA time total: 78.013us +Self CPU time total: 556.496us +Self CUDA time total: 20.542us @@ -4493,19 +4493,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S1024_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 17.00% 113.402us 99.21% 661.694us 661.694us 0.000us 0.00% 284.025us 284.025us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 6.92% 46.121us 80.21% 534.951us 178.317us 178.523us 100.00% 284.025us 94.675us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 179.835us 100.73% 179.835us 179.835us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 178.523us 100.00% 178.523us 59.508us 3 - Activity Buffer Request 40.82% 272.256us 40.82% 272.256us 272.256us 105.502us 59.10% 105.502us 105.502us 1 - aten::view 2.00% 13.341us 2.00% 13.341us 2.223us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 6.85% 45.671us 6.85% 45.671us 5.075us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.72% 4.820us 0.72% 4.820us 1.607us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 24.90% 166.083us 24.90% 166.083us 55.361us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.79% 5.260us 0.79% 5.260us 5.260us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 128.543us 194.15% 128.543us 128.543us 1 + hf_kernels_layer_norm 6.47% 121.263us 99.74% 1.870ms 1.870ms 0.000us 0.00% 103.680us 103.680us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.50% 46.880us 92.61% 1.737ms 578.834us 66.208us 100.00% 103.680us 34.560us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 66.208us 100.00% 66.208us 22.069us 3 + Activity Buffer Request 80.04% 1.501ms 80.04% 1.501ms 1.501ms 37.472us 56.60% 37.472us 37.472us 1 + aten::view 0.67% 12.550us 0.67% 12.550us 2.092us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.61% 30.111us 1.61% 30.111us 3.346us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.29% 5.429us 0.29% 5.429us 1.810us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 8.17% 153.262us 8.17% 153.262us 51.087us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.26% 4.790us 0.26% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 666.954us -Self CUDA time total: 178.523us +Self CPU time total: 1.875ms +Self CUDA time total: 66.208us @@ -4515,19 +4515,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S1024_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 10.00% 107.534us 43.74% 470.530us 470.530us 0.000us 0.00% 1.006ms 1.006ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 3.96% 42.640us 32.65% 351.216us 117.072us 763.349us 100.00% 1.006ms 335.355us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 764.661us 100.17% 764.661us 764.661us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 763.349us 100.00% 763.349us 254.450us 3 - Activity Buffer Request 10.47% 112.593us 10.47% 112.593us 112.593us 242.717us 31.80% 242.717us 242.717us 1 - aten::view 1.10% 11.780us 1.10% 11.780us 1.963us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 2.92% 31.430us 2.92% 31.430us 3.492us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.42% 4.470us 0.42% 4.470us 1.490us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 14.88% 160.083us 14.88% 160.083us 53.361us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 56.26% 605.233us 56.26% 605.233us 605.233us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 21.88% 101.912us 98.91% 460.726us 460.726us 0.000us 0.00% 193.786us 193.786us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 10.30% 47.997us 74.62% 347.614us 115.871us 120.124us 100.00% 193.786us 64.595us 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 129.116us 107.49% 129.116us 129.116us 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 120.124us 100.00% 120.124us 40.041us 3 + Activity Buffer Request 23.66% 110.222us 23.66% 110.222us 110.222us 73.662us 61.32% 73.662us 73.662us 1 + aten::view 2.40% 11.200us 2.40% 11.200us 1.867us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 6.29% 29.283us 6.29% 29.283us 3.254us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.07% 4.970us 1.07% 4.970us 1.657us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 33.30% 155.142us 33.30% 155.142us 51.714us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.09% 5.100us 1.09% 5.100us 5.100us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.076ms -Self CUDA time total: 763.349us +Self CPU time total: 465.826us +Self CUDA time total: 120.124us @@ -4537,19 +4537,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S1024_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 4.84% 112.452us 36.30% 843.368us 843.368us 0.000us 0.00% 2.131ms 2.131ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 1.85% 43.099us 30.92% 718.406us 239.469us 1.645ms 100.00% 2.131ms 710.322us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 1.647ms 100.09% 1.647ms 1.647ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 1.645ms 100.00% 1.645ms 548.437us 3 - Activity Buffer Request 20.28% 471.291us 20.28% 471.291us 471.291us 485.654us 29.52% 485.654us 485.654us 1 - aten::view 0.54% 12.510us 0.54% 12.510us 2.085us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.34% 31.070us 1.34% 31.070us 3.452us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.22% 5.221us 0.22% 5.221us 1.740us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 7.22% 167.725us 7.22% 167.725us 55.908us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 63.70% 1.480ms 63.70% 1.480ms 1.480ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 10.47% 108.133us 61.96% 639.990us 639.990us 0.000us 0.00% 741.038us 741.038us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 4.66% 48.171us 50.27% 519.257us 173.086us 556.019us 100.00% 741.038us 247.013us 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 557.395us 100.25% 557.395us 557.395us 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 556.019us 100.00% 556.019us 185.340us 3 + Activity Buffer Request 26.52% 273.914us 26.52% 273.914us 273.914us 185.019us 33.28% 185.019us 185.019us 1 + aten::view 1.22% 12.600us 1.22% 12.600us 2.100us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 2.91% 30.100us 2.91% 30.100us 3.344us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.47% 4.869us 0.47% 4.869us 1.623us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 15.70% 162.203us 15.70% 162.203us 54.068us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 38.04% 392.946us 38.04% 392.946us 392.946us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.324ms -Self CUDA time total: 1.645ms +Self CPU time total: 1.033ms +Self CUDA time total: 556.019us @@ -4559,19 +4559,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S2048_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 22.85% 107.114us 99.00% 464.050us 464.050us 0.000us 0.00% 251.384us 251.384us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 9.45% 44.281us 73.62% 345.116us 115.039us 150.043us 100.00% 251.384us 83.795us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 151.420us 100.92% 151.420us 151.420us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 150.043us 100.00% 150.043us 50.014us 3 - Activity Buffer Request 22.03% 103.262us 22.03% 103.262us 103.262us 101.341us 67.54% 101.341us 101.341us 1 - aten::view 2.52% 11.820us 2.52% 11.820us 1.970us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 6.37% 29.880us 6.37% 29.880us 3.320us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.27% 5.930us 1.27% 5.930us 1.977us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 34.51% 161.763us 34.51% 161.763us 53.921us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.00% 4.700us 1.00% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 117.533us 202.70% 117.533us 117.533us 1 + hf_kernels_layer_norm 16.63% 101.441us 99.21% 605.228us 605.228us 0.000us 0.00% 93.950us 93.950us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 7.68% 46.841us 80.72% 492.428us 164.143us 57.983us 100.00% 93.950us 31.317us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 57.983us 100.00% 57.983us 19.328us 3 + Activity Buffer Request 41.81% 255.054us 41.81% 255.054us 255.054us 35.967us 62.03% 35.967us 35.967us 1 + aten::view 1.86% 11.359us 1.86% 11.359us 1.893us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 4.84% 29.531us 4.84% 29.531us 3.281us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.88% 5.399us 0.88% 5.399us 1.800us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 25.51% 155.603us 25.51% 155.603us 51.868us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.79% 4.850us 0.79% 4.850us 4.850us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 468.750us -Self CUDA time total: 150.043us +Self CPU time total: 610.078us +Self CUDA time total: 57.983us @@ -4581,19 +4581,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S2048_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 7.34% 110.231us 57.64% 865.428us 865.428us 0.000us 0.00% 1.059ms 1.059ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 2.97% 44.579us 49.44% 742.306us 247.435us 800.455us 100.00% 1.059ms 352.853us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 801.894us 100.18% 801.894us 801.894us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 800.455us 100.00% 800.455us 266.818us 3 - Activity Buffer Request 33.27% 499.511us 33.27% 499.511us 499.511us 258.104us 32.24% 258.104us 258.104us 1 - aten::view 0.86% 12.891us 0.86% 12.891us 2.148us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 2.04% 30.574us 2.04% 30.574us 3.397us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.36% 5.369us 0.36% 5.369us 1.790us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 10.81% 162.273us 10.81% 162.273us 54.091us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 42.36% 635.954us 42.36% 635.954us 635.954us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 16.67% 104.061us 99.23% 619.539us 619.539us 0.000us 0.00% 218.617us 218.617us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 7.57% 47.260us 80.66% 503.568us 167.856us 138.780us 100.00% 218.617us 72.872us 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 140.188us 101.01% 140.188us 140.188us 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 138.780us 100.00% 138.780us 46.260us 3 + Activity Buffer Request 42.90% 267.854us 42.90% 267.854us 267.854us 79.837us 57.53% 79.837us 79.837us 1 + aten::view 1.91% 11.910us 1.91% 11.910us 1.985us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 4.81% 30.001us 4.81% 30.001us 3.333us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.76% 4.720us 0.76% 4.720us 1.573us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 24.62% 153.733us 24.62% 153.733us 51.244us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.77% 4.780us 0.77% 4.780us 4.780us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.501ms -Self CUDA time total: 800.455us +Self CPU time total: 624.319us +Self CUDA time total: 138.780us @@ -4603,19 +4603,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S2048_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 6.21% 121.211us 24.06% 469.730us 469.730us 0.000us 0.00% 2.133ms 2.133ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 2.16% 42.120us 17.26% 337.067us 112.356us 1.640ms 100.00% 2.133ms 710.841us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 1.641ms 100.08% 1.641ms 1.641ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 1.640ms 100.00% 1.640ms 546.618us 3 - Activity Buffer Request 4.76% 92.922us 4.76% 92.922us 92.922us 492.667us 30.04% 492.667us 492.667us 1 - aten::view 0.59% 11.452us 0.59% 11.452us 1.909us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.61% 31.362us 1.61% 31.362us 3.485us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.23% 4.460us 0.23% 4.460us 1.487us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 8.51% 166.203us 8.51% 166.203us 55.401us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 75.94% 1.483ms 75.94% 1.483ms 1.483ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 11.56% 103.222us 56.17% 501.697us 501.697us 0.000us 0.00% 729.744us 729.744us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 5.35% 47.791us 43.31% 386.845us 128.948us 547.924us 100.00% 729.744us 243.248us 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 549.427us 100.27% 549.427us 549.427us 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 547.924us 100.00% 547.924us 182.641us 3 + Activity Buffer Request 16.56% 147.902us 16.56% 147.902us 147.902us 181.820us 33.18% 181.820us 181.820us 1 + aten::view 1.30% 11.630us 1.30% 11.630us 1.938us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 3.31% 29.600us 3.31% 29.600us 3.289us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.55% 4.940us 0.55% 4.940us 1.647us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 17.53% 156.612us 17.53% 156.612us 52.204us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 43.83% 391.555us 43.83% 391.555us 391.555us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.953ms -Self CUDA time total: 1.640ms +Self CPU time total: 893.252us +Self CUDA time total: 547.924us @@ -4625,19 +4625,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S2048_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 2.73% 111.353us 22.06% 898.879us 898.879us 0.000us 0.00% 4.367ms 4.367ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 1.02% 41.530us 19.00% 774.227us 258.076us 3.342ms 100.00% 4.367ms 1.456ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 3.343ms 100.04% 3.343ms 3.343ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 3.342ms 100.00% 3.342ms 1.114ms 3 - Activity Buffer Request 12.92% 526.282us 12.92% 526.282us 526.282us 1.025ms 30.68% 1.025ms 1.025ms 1 - aten::view 0.33% 13.299us 0.33% 13.299us 2.217us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.83% 33.890us 0.83% 33.890us 3.766us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.13% 5.100us 0.13% 5.100us 1.700us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 4.11% 167.425us 4.11% 167.425us 55.808us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 77.94% 3.175ms 77.94% 3.175ms 3.175ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 6.34% 102.532us 36.35% 588.198us 588.198us 0.000us 0.00% 1.536ms 1.536ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.97% 48.143us 29.27% 473.696us 157.899us 1.186ms 100.00% 1.536ms 511.906us 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 1.188ms 100.13% 1.188ms 1.188ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 1.186ms 100.00% 1.186ms 395.396us 3 + Activity Buffer Request 14.38% 232.673us 14.38% 232.673us 232.673us 349.530us 29.47% 349.530us 349.530us 1 + aten::view 0.74% 11.970us 0.74% 11.970us 1.995us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.86% 30.039us 1.86% 30.039us 3.338us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.30% 4.850us 0.30% 4.850us 1.617us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 9.76% 157.991us 9.76% 157.991us 52.664us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 63.65% 1.030ms 63.65% 1.030ms 1.030ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.074ms -Self CUDA time total: 3.342ms +Self CPU time total: 1.618ms +Self CUDA time total: 1.186ms @@ -4647,19 +4647,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S128_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 123.232us 471.36% 123.232us 123.232us 1 - hf_kernels_layer_norm 12.80% 108.565us 99.46% 843.618us 843.618us 0.000us 0.00% 35.008us 35.008us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 5.05% 42.820us 85.21% 722.694us 240.898us 26.144us 100.00% 35.008us 11.669us 3 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 26.144us 100.00% 26.144us 8.715us 3 - Activity Buffer Request 56.64% 480.421us 56.64% 480.421us 480.421us 8.864us 33.90% 8.864us 8.864us 1 - aten::view 1.46% 12.359us 1.46% 12.359us 2.060us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 3.60% 30.550us 3.60% 30.550us 3.394us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.62% 5.250us 0.62% 5.250us 1.750us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 19.29% 163.653us 19.29% 163.653us 54.551us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.54% 4.550us 0.54% 4.550us 4.550us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 112.413us 848.59% 112.413us 112.413us 1 + hf_kernels_layer_norm 21.62% 101.733us 99.00% 465.906us 465.906us 0.000us 0.00% 17.726us 17.726us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 10.03% 47.199us 74.95% 352.704us 117.568us 13.247us 100.00% 17.726us 5.909us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 13.247us 100.00% 13.247us 4.416us 3 + Activity Buffer Request 24.84% 116.882us 24.84% 116.882us 116.882us 4.479us 33.81% 4.479us 4.479us 1 + aten::view 2.44% 11.469us 2.44% 11.469us 1.912us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 6.31% 29.701us 6.31% 29.701us 3.300us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.17% 5.520us 1.17% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 32.60% 153.402us 32.60% 153.402us 51.134us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.00% 4.700us 1.00% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 848.168us -Self CUDA time total: 26.144us +Self CPU time total: 470.606us +Self CUDA time total: 13.247us @@ -4669,19 +4669,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S128_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 20.19% 105.083us 99.15% 516.111us 516.111us 0.000us 0.00% 144.730us 144.730us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 8.34% 43.421us 76.72% 399.369us 133.123us 91.356us 100.00% 144.730us 48.243us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 129.148us 141.37% 129.148us 129.148us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 91.356us 100.00% 91.356us 30.452us 3 - Activity Buffer Request 30.57% 159.154us 30.57% 159.154us 159.154us 53.374us 58.42% 53.374us 53.374us 1 - aten::view 2.24% 11.659us 2.24% 11.659us 1.943us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 5.91% 30.740us 5.91% 30.740us 3.416us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.03% 5.350us 1.03% 5.350us 1.783us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 30.87% 160.704us 30.87% 160.704us 53.568us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.85% 4.440us 0.85% 4.440us 4.440us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 116.766us 456.71% 116.766us 116.766us 1 + hf_kernels_layer_norm 17.51% 102.502us 99.17% 580.409us 580.409us 0.000us 0.00% 34.239us 34.239us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 7.99% 46.742us 79.55% 465.587us 155.196us 25.567us 100.00% 34.239us 11.413us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 25.567us 100.00% 25.567us 8.522us 3 + Activity Buffer Request 39.32% 230.104us 39.32% 230.104us 230.104us 8.672us 33.92% 8.672us 8.672us 1 + aten::view 2.11% 12.320us 2.11% 12.320us 2.053us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 5.04% 29.500us 5.04% 29.500us 3.278us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.02% 5.979us 1.02% 5.979us 1.993us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 26.19% 153.262us 26.19% 153.262us 51.087us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.83% 4.860us 0.83% 4.860us 4.860us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 520.551us -Self CUDA time total: 91.356us +Self CPU time total: 585.269us +Self CUDA time total: 25.567us @@ -4691,19 +4691,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S128_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 13.09% 110.823us 99.44% 841.628us 841.628us 0.000us 0.00% 249.627us 249.627us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 5.86% 49.630us 84.88% 718.434us 239.478us 153.277us 100.00% 249.627us 83.209us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 154.621us 100.88% 154.621us 154.621us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 153.277us 100.00% 153.277us 51.092us 3 - Activity Buffer Request 55.47% 469.480us 55.47% 469.480us 469.480us 96.350us 62.86% 96.350us 96.350us 1 - aten::view 1.46% 12.371us 1.46% 12.371us 2.062us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 3.70% 31.290us 3.70% 31.290us 3.477us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.56% 4.751us 0.56% 4.751us 1.584us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 19.29% 163.283us 19.29% 163.283us 54.428us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.56% 4.750us 0.56% 4.750us 4.750us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 120.223us 201.23% 120.223us 120.223us 1 + hf_kernels_layer_norm 16.35% 102.201us 99.23% 620.398us 620.398us 0.000us 0.00% 95.200us 95.200us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 7.44% 46.527us 81.07% 506.887us 168.962us 59.744us 100.00% 95.200us 31.733us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 59.744us 100.00% 59.744us 19.915us 3 + Activity Buffer Request 43.52% 272.134us 43.52% 272.134us 272.134us 35.456us 59.35% 35.456us 35.456us 1 + aten::view 1.81% 11.310us 1.81% 11.310us 1.885us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 4.69% 29.332us 4.69% 29.332us 3.259us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.86% 5.391us 0.86% 5.391us 1.797us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 24.55% 153.503us 24.55% 153.503us 51.168us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.77% 4.841us 0.77% 4.841us 4.841us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 846.378us -Self CUDA time total: 153.277us +Self CPU time total: 625.239us +Self CUDA time total: 59.744us @@ -4713,19 +4713,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S128_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 9.75% 106.662us 43.81% 479.070us 479.070us 0.000us 0.00% 1.022ms 1.022ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 3.93% 43.001us 32.91% 359.867us 119.956us 772.676us 100.00% 1.022ms 340.734us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 773.988us 100.17% 773.988us 773.988us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 772.676us 100.00% 772.676us 257.559us 3 - Activity Buffer Request 10.97% 119.943us 10.97% 119.943us 119.943us 249.527us 32.29% 249.527us 249.527us 1 - aten::view 1.15% 12.541us 1.15% 12.541us 2.090us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 2.94% 32.110us 2.94% 32.110us 3.568us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.41% 4.440us 0.41% 4.440us 1.480us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 14.66% 160.373us 14.66% 160.373us 53.458us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 56.19% 614.524us 56.19% 614.524us 614.524us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 20.57% 103.320us 99.00% 497.196us 497.196us 0.000us 0.00% 197.814us 197.814us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 9.11% 45.760us 76.10% 382.195us 127.398us 124.346us 100.00% 197.814us 65.938us 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 132.857us 106.84% 132.857us 132.857us 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 124.346us 100.00% 124.346us 41.449us 3 + Activity Buffer Request 28.52% 143.222us 28.52% 143.222us 143.222us 73.468us 59.08% 73.468us 73.468us 1 + aten::view 2.33% 11.681us 2.33% 11.681us 1.947us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 6.37% 31.970us 6.37% 31.970us 3.552us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.95% 4.761us 0.95% 4.761us 1.587us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 31.16% 156.482us 31.16% 156.482us 52.161us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.00% 5.020us 1.00% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.094ms -Self CUDA time total: 772.676us +Self CPU time total: 502.216us +Self CUDA time total: 124.346us @@ -4735,19 +4735,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S512_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 12.77% 113.905us 99.47% 886.969us 886.969us 0.000us 0.00% 250.621us 250.621us 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 5.13% 45.759us 85.35% 761.025us 253.675us 149.982us 100.00% 250.621us 83.540us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 151.390us 100.94% 151.390us 151.390us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 149.982us 100.00% 149.982us 49.994us 3 - Activity Buffer Request 57.16% 509.711us 57.16% 509.711us 509.711us 100.639us 67.10% 100.639us 100.639us 1 - aten::view 1.35% 12.039us 1.35% 12.039us 2.007us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 3.32% 29.620us 3.32% 29.620us 3.291us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.60% 5.321us 0.60% 5.321us 1.774us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 19.13% 170.614us 19.13% 170.614us 56.871us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.53% 4.691us 0.53% 4.691us 4.691us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 124.255us 213.59% 124.255us 124.255us 1 + hf_kernels_layer_norm 13.39% 104.902us 99.38% 778.360us 778.360us 0.000us 0.00% 94.430us 94.430us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 6.10% 47.738us 84.51% 661.878us 220.626us 58.175us 100.00% 94.430us 31.477us 3 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 58.175us 100.00% 58.175us 19.392us 3 + Activity Buffer Request 54.12% 423.885us 54.12% 423.885us 423.885us 36.255us 62.32% 36.255us 36.255us 1 + aten::view 1.48% 11.580us 1.48% 11.580us 1.930us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 3.89% 30.461us 3.89% 30.461us 3.385us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.64% 5.001us 0.64% 5.001us 1.667us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 19.76% 154.793us 19.76% 154.793us 51.598us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.62% 4.840us 0.62% 4.840us 4.840us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 891.660us -Self CUDA time total: 149.982us +Self CPU time total: 783.200us +Self CUDA time total: 58.175us @@ -4757,19 +4757,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S512_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 8.98% 115.483us 49.89% 641.834us 641.834us 0.000us 0.00% 1.066ms 1.066ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 3.41% 43.812us 39.92% 513.570us 171.190us 804.418us 100.00% 1.066ms 355.382us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 805.858us 100.18% 805.858us 805.858us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 804.418us 100.00% 804.418us 268.139us 3 - Activity Buffer Request 21.15% 272.045us 21.15% 272.045us 272.045us 261.728us 32.54% 261.728us 261.728us 1 - aten::view 0.99% 12.781us 0.99% 12.781us 2.130us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 2.33% 29.920us 2.33% 29.920us 3.324us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.35% 4.450us 0.35% 4.450us 1.483us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 12.70% 163.343us 12.70% 163.343us 54.448us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 50.11% 644.543us 50.11% 644.543us 644.543us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 21.79% 100.002us 98.87% 453.846us 453.846us 0.000us 0.00% 220.923us 220.923us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 9.94% 45.651us 74.52% 342.064us 114.021us 139.741us 100.00% 220.923us 73.641us 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 141.149us 101.01% 141.149us 141.149us 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 139.741us 100.00% 139.741us 46.580us 3 + Activity Buffer Request 23.19% 106.461us 23.19% 106.461us 106.461us 81.182us 58.09% 81.182us 81.182us 1 + aten::view 2.57% 11.780us 2.57% 11.780us 1.963us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 6.95% 31.900us 6.95% 31.900us 3.544us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.20% 5.510us 1.20% 5.510us 1.837us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 33.23% 152.542us 33.23% 152.542us 50.847us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.13% 5.191us 1.13% 5.191us 5.191us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.286ms -Self CUDA time total: 804.418us +Self CPU time total: 459.037us +Self CUDA time total: 139.741us @@ -4779,19 +4779,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S512_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 5.64% 113.303us 25.29% 508.381us 508.381us 0.000us 0.00% 2.159ms 2.159ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 2.17% 43.641us 19.06% 383.097us 127.699us 1.664ms 100.00% 2.159ms 719.632us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 1.665ms 100.07% 1.665ms 1.665ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 1.664ms 100.00% 1.664ms 554.697us 3 - Activity Buffer Request 7.17% 144.123us 7.17% 144.123us 144.123us 494.805us 29.73% 494.805us 494.805us 1 - aten::view 0.60% 11.981us 0.60% 11.981us 1.997us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.55% 31.100us 1.55% 31.100us 3.456us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.23% 4.550us 0.23% 4.550us 1.517us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 7.94% 159.683us 7.94% 159.683us 53.228us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 74.71% 1.502ms 74.71% 1.502ms 1.502ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 8.50% 106.103us 68.87% 859.212us 859.212us 0.000us 0.00% 730.264us 730.264us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 3.84% 47.858us 59.45% 741.700us 247.233us 547.642us 100.00% 730.264us 243.421us 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 549.114us 100.27% 549.114us 549.114us 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 547.642us 100.00% 547.642us 182.547us 3 + Activity Buffer Request 40.36% 503.557us 40.36% 503.557us 503.557us 182.622us 33.35% 182.622us 182.622us 1 + aten::view 0.91% 11.409us 0.91% 11.409us 1.901us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 2.41% 30.103us 2.41% 30.103us 3.345us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.44% 5.510us 0.44% 5.510us 1.837us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 12.40% 154.672us 12.40% 154.672us 51.557us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 31.13% 388.435us 31.13% 388.435us 388.435us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.010ms -Self CUDA time total: 1.664ms +Self CPU time total: 1.248ms +Self CUDA time total: 547.642us @@ -4801,19 +4801,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S512_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 2.98% 113.892us 17.62% 674.305us 674.305us 0.000us 0.00% 4.332ms 4.332ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 1.13% 43.322us 14.33% 548.253us 182.751us 3.318ms 100.00% 4.332ms 1.444ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 3.319ms 100.04% 3.319ms 3.319ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 3.318ms 100.00% 3.318ms 1.106ms 3 - Activity Buffer Request 7.93% 303.427us 7.93% 303.427us 303.427us 1.015ms 30.58% 1.015ms 1.015ms 1 - aten::view 0.32% 12.160us 0.32% 12.160us 2.027us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.81% 30.960us 0.81% 30.960us 3.440us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.13% 5.090us 0.13% 5.090us 1.697us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 4.32% 165.454us 4.32% 165.454us 55.151us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 82.38% 3.152ms 82.38% 3.152ms 3.152ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 6.20% 117.401us 45.70% 865.822us 865.822us 0.000us 0.00% 1.533ms 1.533ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.53% 47.909us 38.86% 736.290us 245.430us 1.191ms 100.00% 1.533ms 511.056us 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 1.192ms 100.13% 1.192ms 1.192ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 1.191ms 100.00% 1.191ms 396.977us 3 + Activity Buffer Request 26.13% 495.047us 26.13% 495.047us 495.047us 342.236us 28.74% 342.236us 342.236us 1 + aten::view 0.64% 12.131us 0.64% 12.131us 2.022us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.61% 30.562us 1.61% 30.562us 3.396us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.31% 5.930us 0.31% 5.930us 1.977us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 8.28% 156.842us 8.28% 156.842us 52.281us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 54.30% 1.029ms 54.30% 1.029ms 1.029ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.826ms -Self CUDA time total: 3.318ms +Self CPU time total: 1.895ms +Self CUDA time total: 1.191ms @@ -4823,19 +4823,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S1024_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 9.56% 107.052us 43.15% 483.460us 483.460us 0.000us 0.00% 1.061ms 1.061ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 3.89% 43.551us 32.51% 364.228us 121.409us 796.221us 100.00% 1.061ms 353.791us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 797.501us 100.16% 797.501us 797.501us 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 796.221us 100.00% 796.221us 265.407us 3 - Activity Buffer Request 10.86% 121.633us 10.86% 121.633us 121.633us 265.151us 33.30% 265.151us 265.151us 1 - aten::view 1.09% 12.180us 1.09% 12.180us 2.030us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 2.75% 30.759us 2.75% 30.759us 3.418us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.45% 5.070us 0.45% 5.070us 1.690us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 14.57% 163.215us 14.57% 163.215us 54.405us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 56.85% 636.843us 56.85% 636.843us 636.843us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 18.76% 102.890us 99.05% 543.128us 543.128us 0.000us 0.00% 191.549us 191.549us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 9.08% 49.784us 78.18% 428.658us 142.886us 117.790us 100.00% 191.549us 63.850us 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 127.934us 108.61% 127.934us 127.934us 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 117.790us 100.00% 117.790us 39.263us 3 + Activity Buffer Request 33.02% 181.032us 33.02% 181.032us 181.032us 73.759us 62.62% 73.759us 73.759us 1 + aten::view 2.11% 11.580us 2.11% 11.580us 1.930us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 5.47% 30.020us 5.47% 30.020us 3.336us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.00% 5.460us 1.00% 5.460us 1.820us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 29.61% 162.362us 29.61% 162.362us 54.121us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.95% 5.190us 0.95% 5.190us 5.190us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.120ms -Self CUDA time total: 796.221us +Self CPU time total: 548.318us +Self CUDA time total: 117.790us @@ -4845,19 +4845,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S1024_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 5.01% 109.623us 28.99% 634.714us 634.714us 0.000us 0.00% 2.221ms 2.221ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 1.96% 43.002us 23.41% 512.541us 170.847us 1.714ms 100.00% 2.221ms 740.491us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 1.715ms 100.07% 1.715ms 1.715ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 1.714ms 100.00% 1.714ms 571.320us 3 - Activity Buffer Request 12.44% 272.486us 12.44% 272.486us 272.486us 507.513us 29.61% 507.513us 507.513us 1 - aten::view 0.57% 12.550us 0.57% 12.550us 2.092us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.46% 31.970us 1.46% 31.970us 3.552us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.21% 4.629us 0.21% 4.629us 1.543us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 7.33% 160.454us 7.33% 160.454us 53.485us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 71.01% 1.555ms 71.01% 1.555ms 1.555ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 9.87% 125.762us 69.07% 879.903us 879.903us 0.000us 0.00% 766.838us 766.838us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 3.87% 49.332us 58.21% 741.561us 247.187us 575.481us 100.00% 766.838us 255.613us 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 576.857us 100.24% 576.857us 576.857us 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 575.481us 100.00% 575.481us 191.827us 3 + Activity Buffer Request 39.29% 500.518us 39.29% 500.518us 500.518us 191.357us 33.25% 191.357us 191.357us 1 + aten::view 0.99% 12.580us 0.99% 12.580us 2.097us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 2.41% 30.689us 2.41% 30.689us 3.410us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.43% 5.420us 0.43% 5.420us 1.807us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 12.21% 155.602us 12.21% 155.602us 51.867us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 30.93% 394.045us 30.93% 394.045us 394.045us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.190ms -Self CUDA time total: 1.714ms +Self CPU time total: 1.274ms +Self CUDA time total: 575.481us @@ -4867,19 +4867,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S1024_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 2.73% 138.274us 37.76% 1.915ms 1.915ms 0.000us 0.00% 4.337ms 4.337ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 0.84% 42.541us 34.77% 1.764ms 588.006us 3.325ms 100.00% 4.337ms 1.446ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 3.326ms 100.04% 3.326ms 3.326ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 3.325ms 100.00% 3.325ms 1.108ms 3 - Activity Buffer Request 29.98% 1.521ms 29.98% 1.521ms 1.521ms 1.012ms 30.42% 1.012ms 1.012ms 1 - aten::view 0.26% 13.190us 0.26% 13.190us 2.198us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.63% 32.210us 0.63% 32.210us 3.579us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.10% 4.921us 0.10% 4.921us 1.640us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.22% 163.343us 3.22% 163.343us 54.448us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 62.24% 3.157ms 62.24% 3.157ms 3.157ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 6.87% 103.651us 31.62% 476.976us 476.976us 0.000us 0.00% 1.531ms 1.531ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 3.16% 47.619us 23.98% 361.844us 120.615us 1.187ms 100.00% 1.531ms 510.298us 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 1.188ms 100.13% 1.188ms 1.188ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 1.187ms 100.00% 1.187ms 395.515us 3 + Activity Buffer Request 8.20% 123.752us 8.20% 123.752us 123.752us 344.347us 29.02% 344.347us 344.347us 1 + aten::view 0.76% 11.481us 0.76% 11.481us 1.913us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.98% 29.821us 1.98% 29.821us 3.313us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.39% 5.930us 0.39% 5.930us 1.977us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 10.26% 154.722us 10.26% 154.722us 51.574us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 68.38% 1.032ms 68.38% 1.032ms 1.032ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.073ms -Self CUDA time total: 3.325ms +Self CPU time total: 1.509ms +Self CUDA time total: 1.187ms @@ -4889,19 +4889,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S1024_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 1.55% 109.525us 7.88% 556.992us 556.992us 0.000us 0.00% 8.859ms 8.859ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 0.62% 43.791us 6.15% 434.348us 144.783us 6.670ms 100.00% 8.859ms 2.953ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 6.672ms 100.02% 6.672ms 6.672ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 6.670ms 100.00% 6.670ms 2.223ms 3 - Activity Buffer Request 2.69% 189.754us 2.69% 189.754us 189.754us 2.188ms 32.81% 2.188ms 2.188ms 1 - aten::view 0.19% 13.119us 0.19% 13.119us 2.187us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.46% 32.450us 0.46% 32.450us 3.606us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.630us 0.07% 4.630us 1.543us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 2.32% 163.723us 2.32% 163.723us 54.574us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 92.12% 6.509ms 92.12% 6.509ms 6.509ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 4.11% 127.961us 28.50% 887.612us 887.612us 0.000us 0.00% 3.104ms 3.104ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 1.47% 45.722us 24.01% 747.701us 249.234us 2.375ms 100.00% 3.104ms 1.035ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.376ms 100.06% 2.376ms 2.376ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.375ms 100.00% 2.375ms 791.601us 3 + Activity Buffer Request 16.22% 505.157us 16.22% 505.157us 505.157us 729.500us 30.72% 729.500us 729.500us 1 + aten::view 0.38% 11.950us 0.38% 11.950us 1.992us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 0.97% 30.190us 0.97% 30.190us 3.354us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.16% 4.890us 0.16% 4.890us 1.630us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 5.19% 161.742us 5.19% 161.742us 53.914us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 71.50% 2.226ms 71.50% 2.226ms 2.226ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.066ms -Self CUDA time total: 6.670ms +Self CPU time total: 3.114ms +Self CUDA time total: 2.375ms @@ -4911,19 +4911,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 3.93% 134.994us 55.36% 1.902ms 1.902ms 0.000us 0.00% 2.214ms 2.214ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 1.25% 43.084us 51.05% 1.754ms 584.723us 1.702ms 100.00% 2.214ms 738.134us 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 1.703ms 100.08% 1.703ms 1.703ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 1.702ms 100.00% 1.702ms 567.353us 3 - Activity Buffer Request 44.05% 1.514ms 44.05% 1.514ms 1.514ms 512.345us 30.10% 512.345us 512.345us 1 - aten::view 0.39% 13.309us 0.39% 13.309us 2.218us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.90% 31.069us 0.90% 31.069us 3.452us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.14% 4.900us 0.14% 4.900us 1.633us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 4.70% 161.503us 4.70% 161.503us 53.834us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 44.64% 1.534ms 44.64% 1.534ms 1.534ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 5.82% 128.863us 81.59% 1.808ms 1.808ms 0.000us 0.00% 756.792us 756.792us 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.11% 46.800us 75.21% 1.666ms 555.488us 566.586us 100.00% 756.792us 252.264us 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 567.994us 100.25% 567.994us 567.994us 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 566.586us 100.00% 566.586us 188.862us 3 + Activity Buffer Request 64.48% 1.429ms 64.48% 1.429ms 1.429ms 190.206us 33.57% 190.206us 190.206us 1 + aten::view 0.56% 12.380us 0.56% 12.380us 2.063us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.35% 29.990us 1.35% 29.990us 3.332us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.24% 5.300us 0.24% 5.300us 1.767us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 7.03% 155.802us 7.03% 155.802us 51.934us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 18.41% 407.946us 18.41% 407.946us 407.946us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.436ms -Self CUDA time total: 1.702ms +Self CPU time total: 2.216ms +Self CUDA time total: 566.586us @@ -4933,19 +4933,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 2.85% 110.083us 15.45% 596.563us 596.563us 0.000us 0.00% 4.460ms 4.460ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 1.10% 42.541us 12.28% 474.190us 158.063us 3.425ms 100.00% 4.460ms 1.487ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 3.426ms 100.04% 3.426ms 3.426ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 3.425ms 100.00% 3.425ms 1.142ms 3 - Activity Buffer Request 5.89% 227.585us 5.89% 227.585us 227.585us 1.035ms 30.23% 1.035ms 1.035ms 1 - aten::view 0.32% 12.290us 0.32% 12.290us 2.048us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.80% 30.950us 0.80% 30.950us 3.439us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.11% 4.410us 0.11% 4.410us 1.470us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 4.37% 168.704us 4.37% 168.704us 56.235us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 84.55% 3.265ms 84.55% 3.265ms 3.265ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 6.78% 107.581us 32.18% 510.957us 510.957us 0.000us 0.00% 1.590ms 1.590ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 2.95% 46.851us 24.67% 391.616us 130.539us 1.234ms 100.00% 1.590ms 529.905us 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 1.235ms 100.12% 1.235ms 1.235ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 1.234ms 100.00% 1.234ms 411.346us 3 + Activity Buffer Request 9.78% 155.342us 9.78% 155.342us 155.342us 355.677us 28.82% 355.677us 355.677us 1 + aten::view 0.74% 11.760us 0.74% 11.760us 1.960us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.88% 29.861us 1.88% 29.861us 3.318us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.38% 5.960us 0.38% 5.960us 1.987us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 9.67% 153.602us 9.67% 153.602us 51.201us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 67.82% 1.077ms 67.82% 1.077ms 1.077ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.862ms -Self CUDA time total: 3.425ms +Self CPU time total: 1.588ms +Self CUDA time total: 1.234ms @@ -4955,19 +4955,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 1.57% 111.872us 8.59% 613.703us 613.703us 0.000us 0.00% 8.913ms 8.913ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 0.61% 43.889us 6.86% 489.921us 163.307us 6.694ms 100.00% 8.913ms 2.971ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 6.696ms 100.02% 6.696ms 6.696ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 6.694ms 100.00% 6.694ms 2.231ms 3 - Activity Buffer Request 3.32% 237.355us 3.32% 237.355us 237.355us 2.218ms 33.13% 2.218ms 2.218ms 1 - aten::view 0.17% 11.910us 0.17% 11.910us 1.985us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.44% 31.511us 0.44% 31.511us 3.501us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 5.351us 0.07% 5.351us 1.784us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 2.41% 171.815us 2.41% 171.815us 57.272us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 91.41% 6.530ms 91.41% 6.530ms 6.530ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 4.29% 122.511us 22.27% 635.379us 635.379us 0.000us 0.00% 3.116ms 3.116ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 1.67% 47.772us 17.54% 500.568us 166.856us 2.375ms 100.00% 3.116ms 1.039ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.377ms 100.06% 2.377ms 2.377ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.375ms 100.00% 2.375ms 791.801us 3 + Activity Buffer Request 8.85% 252.513us 8.85% 252.513us 252.513us 740.986us 31.19% 740.986us 740.986us 1 + aten::view 0.43% 12.300us 0.43% 12.300us 2.050us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.05% 29.891us 1.05% 29.891us 3.321us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.21% 6.001us 0.21% 6.001us 2.000us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 5.76% 164.391us 5.76% 164.391us 54.797us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 77.73% 2.218ms 77.73% 2.218ms 2.218ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.144ms -Self CUDA time total: 6.694ms +Self CPU time total: 2.853ms +Self CUDA time total: 2.375ms @@ -4977,129 +4977,81 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 0.82% 112.894us 4.93% 682.565us 682.565us 0.000us 0.00% 17.728ms 17.728ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 0.31% 43.279us 4.02% 556.541us 185.514us 13.321ms 100.00% 17.728ms 5.909ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 13.323ms 100.01% 13.323ms 13.323ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 13.321ms 100.00% 13.321ms 4.440ms 3 - Activity Buffer Request 2.16% 298.477us 2.16% 298.477us 298.477us 4.407ms 33.08% 4.407ms 4.407ms 1 - aten::view 0.09% 13.130us 0.09% 13.130us 2.188us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.24% 33.051us 0.24% 33.051us 3.672us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.04% 5.240us 0.04% 5.240us 1.747us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.28% 176.494us 1.28% 176.494us 58.831us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 95.07% 13.154ms 95.07% 13.154ms 13.154ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 2.07% 109.351us 12.73% 673.809us 673.809us 0.000us 0.00% 6.337ms 6.337ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 0.93% 49.100us 10.45% 553.127us 184.376us 4.781ms 100.00% 6.337ms 2.112ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.782ms 100.03% 4.782ms 4.782ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.781ms 100.00% 4.781ms 1.594ms 3 + Activity Buffer Request 5.38% 284.544us 5.38% 284.544us 284.544us 1.556ms 32.54% 1.556ms 1.556ms 1 + aten::view 0.21% 11.331us 0.21% 11.331us 1.889us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 0.57% 29.971us 0.57% 29.971us 3.330us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.11% 5.990us 0.11% 5.990us 1.997us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.47% 183.522us 3.47% 183.522us 61.174us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 87.27% 4.620ms 87.27% 4.620ms 4.620ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 13.836ms -Self CUDA time total: 13.321ms +Self CPU time total: 5.294ms +Self CUDA time total: 4.781ms impl wl p50(ms) ok -hf_kernels_layer_norm LN_B16_S1024_D1024 0.29 False -hf_kernels_layer_norm LN_B16_S1024_D2048 0.61 False -hf_kernels_layer_norm LN_B16_S1024_D4096 1.15 False -hf_kernels_layer_norm LN_B16_S1024_D8192 2.27 False +hf_kernels_layer_norm LN_B16_S1024_D1024 0.05 False +hf_kernels_layer_norm LN_B16_S1024_D2048 0.22 False +hf_kernels_layer_norm LN_B16_S1024_D4096 0.44 False +hf_kernels_layer_norm LN_B16_S1024_D8192 0.84 False hf_kernels_layer_norm LN_B16_S128_D1024 0.05 False hf_kernels_layer_norm LN_B16_S128_D2048 0.05 False -hf_kernels_layer_norm LN_B16_S128_D4096 0.06 False -hf_kernels_layer_norm LN_B16_S128_D8192 0.30 False -hf_kernels_layer_norm LN_B16_S2048_D1024 0.61 False -hf_kernels_layer_norm LN_B16_S2048_D2048 1.20 False -hf_kernels_layer_norm LN_B16_S2048_D4096 2.27 False -hf_kernels_layer_norm LN_B16_S2048_D8192 4.51 False -hf_kernels_layer_norm LN_B16_S512_D1024 0.06 False -hf_kernels_layer_norm LN_B16_S512_D2048 0.30 False -hf_kernels_layer_norm LN_B16_S512_D4096 0.59 False -hf_kernels_layer_norm LN_B16_S512_D8192 1.16 False +hf_kernels_layer_norm LN_B16_S128_D4096 0.05 False +hf_kernels_layer_norm LN_B16_S128_D8192 0.05 False +hf_kernels_layer_norm LN_B16_S2048_D1024 0.21 False +hf_kernels_layer_norm LN_B16_S2048_D2048 0.46 False +hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 False +hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 False +hf_kernels_layer_norm LN_B16_S512_D1024 0.05 False +hf_kernels_layer_norm LN_B16_S512_D2048 0.05 False +hf_kernels_layer_norm LN_B16_S512_D4096 0.21 False +hf_kernels_layer_norm LN_B16_S512_D8192 0.43 False hf_kernels_layer_norm LN_B1_S1024_D1024 0.05 False hf_kernels_layer_norm LN_B1_S1024_D2048 0.05 False hf_kernels_layer_norm LN_B1_S1024_D4096 0.05 False -hf_kernels_layer_norm LN_B1_S1024_D8192 0.06 False -hf_kernels_layer_norm LN_B1_S128_D1024 0.05 False +hf_kernels_layer_norm LN_B1_S1024_D8192 0.05 False +hf_kernels_layer_norm LN_B1_S128_D1024 0.04 False hf_kernels_layer_norm LN_B1_S128_D2048 0.05 False hf_kernels_layer_norm LN_B1_S128_D4096 0.05 False hf_kernels_layer_norm LN_B1_S128_D8192 0.05 False hf_kernels_layer_norm LN_B1_S2048_D1024 0.05 False hf_kernels_layer_norm LN_B1_S2048_D2048 0.05 False -hf_kernels_layer_norm LN_B1_S2048_D4096 0.06 False -hf_kernels_layer_norm LN_B1_S2048_D8192 0.29 False +hf_kernels_layer_norm LN_B1_S2048_D4096 0.05 False +hf_kernels_layer_norm LN_B1_S2048_D8192 0.05 False hf_kernels_layer_norm LN_B1_S512_D1024 0.05 False hf_kernels_layer_norm LN_B1_S512_D2048 0.05 False hf_kernels_layer_norm LN_B1_S512_D4096 0.05 False hf_kernels_layer_norm LN_B1_S512_D8192 0.05 False hf_kernels_layer_norm LN_B4_S1024_D1024 0.05 False -hf_kernels_layer_norm LN_B4_S1024_D2048 0.07 False -hf_kernels_layer_norm LN_B4_S1024_D4096 0.29 False -hf_kernels_layer_norm LN_B4_S1024_D8192 0.59 False +hf_kernels_layer_norm LN_B4_S1024_D2048 0.05 False +hf_kernels_layer_norm LN_B4_S1024_D4096 0.05 False +hf_kernels_layer_norm LN_B4_S1024_D8192 0.21 False hf_kernels_layer_norm LN_B4_S128_D1024 0.05 False hf_kernels_layer_norm LN_B4_S128_D2048 0.05 False hf_kernels_layer_norm LN_B4_S128_D4096 0.05 False hf_kernels_layer_norm LN_B4_S128_D8192 0.05 False -hf_kernels_layer_norm LN_B4_S2048_D1024 0.06 False -hf_kernels_layer_norm LN_B4_S2048_D2048 0.30 False -hf_kernels_layer_norm LN_B4_S2048_D4096 0.60 False -hf_kernels_layer_norm LN_B4_S2048_D8192 1.15 False +hf_kernels_layer_norm LN_B4_S2048_D1024 0.05 False +hf_kernels_layer_norm LN_B4_S2048_D2048 0.06 False +hf_kernels_layer_norm LN_B4_S2048_D4096 0.21 False +hf_kernels_layer_norm LN_B4_S2048_D8192 0.44 False hf_kernels_layer_norm LN_B4_S512_D1024 0.05 False hf_kernels_layer_norm LN_B4_S512_D2048 0.05 False -hf_kernels_layer_norm LN_B4_S512_D4096 0.06 False -hf_kernels_layer_norm LN_B4_S512_D8192 0.29 False +hf_kernels_layer_norm LN_B4_S512_D4096 0.05 False +hf_kernels_layer_norm LN_B4_S512_D8192 0.05 False
▶ UV Install Logs
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s] -Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 4.84it/s] -Fetching 4 files: 50%|█████ | 2/4 [00:02<00:02, 1.20s/it] -Fetching 4 files: 100%|██████████| 4/4 [00:02<00:00, 1.91it/s]
+Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 7.10it/s] +Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.13it/s] +Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.59it/s]

Artifacts:

layer_norm.jsonl diff --git a/layer_norm/impls/torch_layer_norm.html b/layer_norm/impls/torch_layer_norm.html index 41e261deb76e9889eb66386796e50a970adfd528..6ab9639b8e0f9d249da06445fa4c9ac229f15e09 100644 --- a/layer_norm/impls/torch_layer_norm.html +++ b/layer_norm/impls/torch_layer_norm.html @@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 4.05s +Cell: nv | 0.22s | Raw @@ -3887,34 +3887,22 @@ Cell: nv | 4.05s
-
Fri Oct 24 19:23:22 2025       
+
Mon Oct 27 14:46:07 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
-|   0  NVIDIA L4                      Off |   00000000:38:00.0 Off |                    0 |
-| N/A   36C    P0             27W /   72W |       1MiB /  23034MiB |      0%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   1  NVIDIA L4                      Off |   00000000:3A:00.0 Off |                    0 |
-| N/A   33C    P0             28W /   72W |       1MiB /  23034MiB |      2%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   2  NVIDIA L4                      Off |   00000000:3C:00.0 Off |                    0 |
-| N/A   34C    P0             27W /   72W |       1MiB /  23034MiB |      2%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   3  NVIDIA L4                      Off |   00000000:3E:00.0 Off |                    0 |
-| N/A   33C    P0             27W /   72W |       1MiB /  23034MiB |      2%      Default |
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   31C    P0             79W /  350W |       0MiB /  46068MiB |     11%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
 +-----------------------------------------------------------------------------------------+
 | Processes:                                                                              |
-|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
 |        ID   ID                                                               Usage      |
 |=========================================================================================|
 |  No running processes found                                                             |
@@ -3932,7 +3920,7 @@ Cell: nv | 4.05s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 43.38s
+Cell: benchmark | 7.77s
  | 
 
 Raw
@@ -3979,19 +3967,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     112.575us      1075.83%     112.575us     112.575us             1  
-                                       torch_layer_norm         7.69%     144.433us        99.57%       1.871ms       1.871ms       0.000us         0.00%      14.080us      14.080us             1  
-                                       aten::layer_norm         0.88%      16.561us        91.89%       1.727ms     575.622us       0.000us         0.00%      14.080us       4.693us             3  
-                                aten::native_layer_norm         4.64%      87.271us        91.01%       1.710ms     570.102us      10.464us       100.00%      14.080us       4.693us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      10.464us       100.00%      10.464us       3.488us             3  
-                                Activity Buffer Request        80.98%       1.522ms        80.98%       1.522ms       1.522ms       3.616us        34.56%       3.616us       3.616us             1  
-                                            aten::empty         2.62%      49.332us         2.62%      49.332us       5.481us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         2.38%      44.720us         2.38%      44.720us      14.907us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.37%       7.020us         0.37%       7.020us       1.170us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.43%       8.020us         0.43%       8.020us       8.020us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     117.951us      1284.31%     117.951us     117.951us             1  
+                                       torch_layer_norm         8.74%     158.633us        99.57%       1.807ms       1.807ms       0.000us         0.00%      12.352us      12.352us             1  
+                                       aten::layer_norm         0.95%      17.160us        90.83%       1.649ms     549.530us       0.000us         0.00%      12.352us       4.117us             3  
+                                aten::native_layer_norm         4.49%      81.559us        89.88%       1.631ms     543.810us       9.184us       100.00%      12.352us       4.117us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       9.184us       100.00%       9.184us       3.061us             3  
+                                Activity Buffer Request        79.88%       1.450ms        79.88%       1.450ms       1.450ms       3.168us        34.49%       3.168us       3.168us             1  
+                                            aten::empty         2.58%      46.801us         2.58%      46.801us       5.200us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         2.54%      46.162us         2.54%      46.162us      15.387us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.39%       7.072us         0.39%       7.072us       1.179us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.43%       7.860us         0.43%       7.860us       7.860us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.879ms
-Self CUDA time total: 10.464us
+Self CPU time total: 1.815ms
+Self CUDA time total: 9.184us
 
 
 
@@ -4001,19 +3989,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      92.221us       706.40%      92.221us      92.221us             1  
-                                       torch_layer_norm         4.41%      75.663us        99.71%       1.711ms       1.711ms       0.000us         0.00%      17.343us      17.343us             1  
-                                       aten::layer_norm         0.51%       8.781us        95.30%       1.636ms     545.198us       0.000us         0.00%      17.343us       5.781us             3  
-                                aten::native_layer_norm         2.86%      49.142us        94.79%       1.627ms     542.271us      13.055us       100.00%      17.343us       5.781us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      13.055us       100.00%      13.055us       4.352us             3  
-                                Activity Buffer Request        88.33%       1.516ms        88.33%       1.516ms       1.516ms       4.288us        32.85%       4.288us       4.288us             1  
-                                            aten::empty         1.73%      29.720us         1.73%      29.720us       3.302us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         1.63%      27.900us         1.63%      27.900us       9.300us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.24%       4.089us         0.24%       4.089us       0.682us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.29%       5.010us         0.29%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      91.263us       777.10%      91.263us      91.263us             1  
+                                       torch_layer_norm         4.45%      73.631us        99.68%       1.650ms       1.650ms       0.000us         0.00%      15.616us      15.616us             1  
+                                       aten::layer_norm         0.53%       8.730us        95.23%       1.577ms     525.519us       0.000us         0.00%      15.616us       5.205us             3  
+                                aten::native_layer_norm         3.21%      53.200us        94.70%       1.568ms     522.609us      11.744us       100.00%      15.616us       5.205us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      11.744us       100.00%      11.744us       3.915us             3  
+                                Activity Buffer Request        87.81%       1.454ms        87.81%       1.454ms       1.454ms       3.872us        32.97%       3.872us       3.872us             1  
+                                            aten::empty         1.80%      29.853us         1.80%      29.853us       3.317us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         1.64%      27.230us         1.64%      27.230us       9.077us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.23%       3.770us         0.23%       3.770us       0.628us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.32%       5.350us         0.32%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.716ms
-Self CUDA time total: 13.055us
+Self CPU time total: 1.656ms
+Self CUDA time total: 11.744us
 
 
 
@@ -4023,19 +4011,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S128_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      93.568us       488.15%      93.568us      93.568us             1  
-                                       torch_layer_norm         4.18%      71.812us        99.71%       1.711ms       1.711ms       0.000us         0.00%      25.600us      25.600us             1  
-                                       aten::layer_norm         0.51%       8.700us        95.53%       1.639ms     546.498us       0.000us         0.00%      25.600us       8.533us             3  
-                                aten::native_layer_norm         2.93%      50.294us        95.02%       1.631ms     543.598us      19.168us       100.00%      25.600us       8.533us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      19.168us       100.00%      19.168us       6.389us             3  
-                                Activity Buffer Request        88.71%       1.522ms        88.71%       1.522ms       1.522ms       6.432us        33.56%       6.432us       6.432us             1  
-                                            aten::empty         1.61%      27.640us         1.61%      27.640us       3.071us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         1.55%      26.519us         1.55%      26.519us       8.840us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.23%       3.889us         0.23%       3.889us       0.648us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.29%       4.970us         0.29%       4.970us       4.970us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      93.407us       570.11%      93.407us      93.407us             1  
+                                       torch_layer_norm         4.26%      70.071us        99.67%       1.640ms       1.640ms       0.000us         0.00%      21.856us      21.856us             1  
+                                       aten::layer_norm         0.57%       9.440us        95.41%       1.570ms     523.176us       0.000us         0.00%      21.856us       7.285us             3  
+                                aten::native_layer_norm         3.17%      52.082us        94.83%       1.560ms     520.029us      16.384us       100.00%      21.856us       7.285us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      16.384us       100.00%      16.384us       5.461us             3  
+                                Activity Buffer Request        87.95%       1.447ms        87.95%       1.447ms       1.447ms       5.472us        33.40%       5.472us       5.472us             1  
+                                            aten::empty         1.77%      29.121us         1.77%      29.121us       3.236us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         1.71%      28.080us         1.71%      28.080us       9.360us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.24%       4.030us         0.24%       4.030us       0.672us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.33%       5.460us         0.33%       5.460us       5.460us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.716ms
-Self CUDA time total: 19.168us
+Self CPU time total: 1.645ms
+Self CUDA time total: 16.384us
 
 
 
@@ -4045,19 +4033,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S128_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      92.541us       280.78%      92.541us      92.541us             1  
-                                       torch_layer_norm         4.34%      68.272us        99.69%       1.570ms       1.570ms       0.000us         0.00%      43.839us      43.839us             1  
-                                       aten::layer_norm         0.53%       8.411us        95.35%       1.502ms     500.504us       0.000us         0.00%      43.839us      14.613us             3  
-                                aten::native_layer_norm         3.08%      48.533us        94.82%       1.493ms     497.700us      32.959us       100.00%      43.839us      14.613us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      32.959us       100.00%      32.959us      10.986us             3  
-                                Activity Buffer Request        75.11%       1.183ms        75.11%       1.183ms       1.183ms      10.880us        33.01%      10.880us      10.880us             1  
-                                            aten::empty         1.74%      27.430us         1.74%      27.430us       3.048us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        14.65%     230.765us        14.65%     230.765us      76.922us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.24%       3.717us         0.24%       3.717us       0.619us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.31%       4.880us         0.31%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     118.239us       440.39%     118.239us     118.239us             1  
+                                       torch_layer_norm         5.44%      79.142us        99.61%       1.449ms       1.449ms       0.000us         0.00%      35.810us      35.810us             1  
+                                       aten::layer_norm         0.75%      10.900us        94.17%       1.370ms     456.578us       0.000us         0.00%      35.810us      11.937us             3  
+                                aten::native_layer_norm         4.07%      59.211us        93.42%       1.359ms     452.944us      26.849us       100.00%      35.810us      11.937us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      26.849us       100.00%      26.849us       8.950us             3  
+                                Activity Buffer Request        72.70%       1.057ms        72.70%       1.057ms       1.057ms       8.961us        33.38%       8.961us       8.961us             1  
+                                            aten::empty         2.44%      35.559us         2.44%      35.559us       3.951us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        13.86%     201.604us        13.86%     201.604us      67.201us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.34%       4.961us         0.34%       4.961us       0.827us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.39%       5.680us         0.39%       5.680us       5.680us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.575ms
-Self CUDA time total: 32.959us
+Self CPU time total: 1.455ms
+Self CUDA time total: 26.849us
 
 
 
@@ -4067,19 +4055,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      87.967us       645.30%      87.967us      87.967us             1  
-                                       torch_layer_norm         3.73%      70.373us        99.76%       1.880ms       1.880ms       0.000us         0.00%      18.016us      18.016us             1  
-                                       aten::layer_norm         0.45%       8.529us        96.03%       1.809ms     603.153us       0.000us         0.00%      18.016us       6.005us             3  
-                                aten::native_layer_norm         2.56%      48.230us        95.57%       1.801ms     600.310us      13.632us       100.00%      18.016us       6.005us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      13.632us       100.00%      13.632us       4.544us             3  
-                                Activity Buffer Request        79.33%       1.495ms        79.33%       1.495ms       1.495ms       4.384us        32.16%       4.384us       4.384us             1  
-                                            aten::empty         1.49%      27.990us         1.49%      27.990us       3.110us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        12.00%     226.104us        12.00%     226.104us      75.368us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.20%       3.822us         0.20%       3.822us       0.637us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.480us         0.24%       4.480us       4.480us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      95.007us       954.65%      95.007us      95.007us             1  
+                                       torch_layer_norm         4.08%      72.861us        99.69%       1.782ms       1.782ms       0.000us         0.00%      13.216us      13.216us             1  
+                                       aten::layer_norm         0.50%       9.010us        95.61%       1.709ms     569.593us       0.000us         0.00%      13.216us       4.405us             3  
+                                aten::native_layer_norm         3.10%      55.433us        95.11%       1.700ms     566.590us       9.952us       100.00%      13.216us       4.405us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       9.952us       100.00%       9.952us       3.317us             3  
+                                Activity Buffer Request        81.03%       1.448ms        81.03%       1.448ms       1.448ms       3.264us        32.80%       3.264us       3.264us             1  
+                                            aten::empty         1.69%      30.250us         1.69%      30.250us       3.361us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         9.05%     161.792us         9.05%     161.792us      53.931us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.23%       4.100us         0.23%       4.100us       0.683us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.31%       5.520us         0.31%       5.520us       5.520us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.884ms
-Self CUDA time total: 13.632us
+Self CPU time total: 1.787ms
+Self CUDA time total: 9.952us
 
 
 
@@ -4089,19 +4077,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      87.132us       436.38%      87.132us      87.132us             1  
-                                       torch_layer_norm        11.20%      67.652us        99.23%     599.293us     599.293us       0.000us         0.00%      26.430us      26.430us             1  
-                                       aten::layer_norm         1.44%       8.699us        88.03%     531.641us     177.214us       0.000us         0.00%      26.430us       8.810us             3  
-                                aten::native_layer_norm         7.85%      47.430us        86.59%     522.942us     174.314us      19.967us       100.00%      26.430us       8.810us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      19.967us       100.00%      19.967us       6.656us             3  
-                                Activity Buffer Request        37.02%     223.565us        37.02%     223.565us     223.565us       6.463us        32.37%       6.463us       6.463us             1  
-                                            aten::empty         4.44%      26.841us         4.44%      26.841us       2.982us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        36.60%     221.066us        36.60%     221.066us      73.689us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.67%       4.040us         0.67%       4.040us       0.673us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.77%       4.630us         0.77%       4.630us       4.630us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      88.574us       668.68%      88.574us      88.574us             1  
+                                       torch_layer_norm        15.40%      66.901us        98.88%     429.607us     429.607us       0.000us         0.00%      17.629us      17.629us             1  
+                                       aten::layer_norm         2.14%       9.290us        83.48%     362.706us     120.902us       0.000us         0.00%      17.629us       5.876us             3  
+                                aten::native_layer_norm        12.03%      52.280us        81.34%     353.416us     117.805us      13.246us       100.00%      17.629us       5.876us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      13.246us       100.00%      13.246us       4.415us             3  
+                                Activity Buffer Request        26.09%     113.362us        26.09%     113.362us     113.362us       4.383us        33.09%       4.383us       4.383us             1  
+                                            aten::empty         6.80%      29.541us         6.80%      29.541us       3.282us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        35.53%     154.353us        35.53%     154.353us      51.451us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.89%       3.880us         0.89%       3.880us       0.647us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.12%       4.880us         1.12%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 603.923us
-Self CUDA time total: 19.967us
+Self CPU time total: 434.487us
+Self CUDA time total: 13.246us
 
 
 
@@ -4111,19 +4099,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S512_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      98.654us       302.25%      98.654us      98.654us             1  
-                                       torch_layer_norm         3.90%      73.122us        99.74%       1.871ms       1.871ms       0.000us         0.00%      42.848us      42.848us             1  
-                                       aten::layer_norm         0.49%       9.220us        95.85%       1.798ms     599.309us       0.000us         0.00%      42.848us      14.283us             3  
-                                aten::native_layer_norm         2.69%      50.411us        95.35%       1.789ms     596.236us      32.640us       100.00%      42.848us      14.283us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      32.640us       100.00%      32.640us      10.880us             3  
-                                Activity Buffer Request        79.57%       1.493ms        79.57%       1.493ms       1.493ms      10.208us        31.27%      10.208us      10.208us             1  
-                                            aten::empty         1.49%      28.020us         1.49%      28.020us       3.113us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        11.39%     213.675us        11.39%     213.675us      71.225us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.21%       4.030us         0.21%       4.030us       0.672us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       4.820us         0.26%       4.820us       4.820us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      96.609us       488.49%      96.609us      96.609us             1  
+                                       torch_layer_norm         4.03%      71.860us        99.72%       1.776ms       1.776ms       0.000us         0.00%      26.305us      26.305us             1  
+                                       aten::layer_norm         0.54%       9.591us        95.68%       1.704ms     568.087us       0.000us         0.00%      26.305us       8.768us             3  
+                                aten::native_layer_norm         2.97%      52.832us        95.14%       1.695ms     564.890us      19.777us       100.00%      26.305us       8.768us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      19.777us       100.00%      19.777us       6.592us             3  
+                                Activity Buffer Request        81.50%       1.452ms        81.50%       1.452ms       1.452ms       6.528us        33.01%       6.528us       6.528us             1  
+                                            aten::empty         1.62%      28.940us         1.62%      28.940us       3.216us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.82%     157.073us         8.82%     157.073us      52.358us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.23%       4.100us         0.23%       4.100us       0.683us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       5.050us         0.28%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.876ms
-Self CUDA time total: 32.640us
+Self CPU time total: 1.781ms
+Self CUDA time total: 19.777us
 
 
 
@@ -4133,19 +4121,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S512_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.66%      69.232us        99.75%       1.887ms       1.887ms       0.000us         0.00%     140.349us     140.349us             1  
-                                       aten::layer_norm         0.48%       9.050us        96.09%       1.817ms     605.826us       0.000us         0.00%     140.349us      46.783us             3  
-                                aten::native_layer_norm         2.62%      49.510us        95.61%       1.808ms     602.810us      87.870us       100.00%     140.349us      46.783us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     103.646us       117.95%     103.646us     103.646us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      87.870us       100.00%      87.870us      29.290us             3  
-                                Activity Buffer Request        80.23%       1.517ms        80.23%       1.517ms       1.517ms      52.479us        59.72%      52.479us      52.479us             1  
-                                            aten::empty         1.47%      27.721us         1.47%      27.721us       3.080us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        11.09%     209.785us        11.09%     209.785us      69.928us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.21%       3.950us         0.21%       3.950us       0.658us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       4.771us         0.25%       4.771us       4.771us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     101.087us       312.17%     101.087us     101.087us             1  
+                                       torch_layer_norm         4.21%      75.141us        99.72%       1.779ms       1.779ms       0.000us         0.00%      43.134us      43.134us             1  
+                                       aten::layer_norm         0.50%       9.000us        95.50%       1.703ms     567.803us       0.000us         0.00%      43.134us      14.378us             3  
+                                aten::native_layer_norm         3.03%      54.032us        95.00%       1.694ms     564.803us      32.382us       100.00%      43.134us      14.378us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      32.382us       100.00%      32.382us      10.794us             3  
+                                Activity Buffer Request        81.39%       1.452ms        81.39%       1.452ms       1.452ms      10.752us        33.20%      10.752us      10.752us             1  
+                                            aten::empty         1.73%      30.799us         1.73%      30.799us       3.422us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.63%     153.894us         8.63%     153.894us      51.298us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.22%       3.990us         0.22%       3.990us       0.665us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       5.050us         0.28%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.891ms
-Self CUDA time total: 87.870us
+Self CPU time total: 1.784ms
+Self CUDA time total: 32.382us
 
 
 
@@ -4155,19 +4143,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S1024_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      86.653us       383.56%      86.653us      86.653us             1  
-                                       torch_layer_norm        10.96%      67.652us        99.24%     612.643us     612.643us       0.000us         0.00%      29.888us      29.888us             1  
-                                       aten::layer_norm         1.40%       8.670us        88.28%     544.991us     181.664us       0.000us         0.00%      29.888us       9.963us             3  
-                                aten::native_layer_norm         7.55%      46.623us        86.87%     536.321us     178.774us      22.592us       100.00%      29.888us       9.963us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      22.592us       100.00%      22.592us       7.531us             3  
-                                Activity Buffer Request        39.97%     246.735us        39.97%     246.735us     246.735us       7.296us        32.29%       7.296us       7.296us             1  
-                                            aten::empty         4.55%      28.120us         4.55%      28.120us       3.124us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        34.17%     210.983us        34.17%     210.983us      70.328us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.63%       3.860us         0.63%       3.860us       0.643us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.76%       4.720us         0.76%       4.720us       4.720us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      84.605us       738.59%      84.605us      84.605us             1  
+                                       torch_layer_norm        14.65%      66.062us        98.90%     446.008us     446.008us       0.000us         0.00%      15.231us      15.231us             1  
+                                       aten::layer_norm         1.88%       8.459us        84.25%     379.946us     126.649us       0.000us         0.00%      15.231us       5.077us             3  
+                                aten::native_layer_norm        11.07%      49.901us        82.38%     371.487us     123.829us      11.455us       100.00%      15.231us       5.077us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      11.455us       100.00%      11.455us       3.818us             3  
+                                Activity Buffer Request        30.37%     136.933us        30.37%     136.933us     136.933us       3.776us        32.96%       3.776us       3.776us             1  
+                                            aten::empty         6.35%      28.620us         6.35%      28.620us       3.180us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        33.76%     152.233us        33.76%     152.233us      50.744us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.84%       3.800us         0.84%       3.800us       0.633us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.10%       4.941us         1.10%       4.941us       4.941us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 617.363us
-Self CUDA time total: 22.592us
+Self CPU time total: 450.949us
+Self CUDA time total: 11.455us
 
 
 
@@ -4177,19 +4165,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S1024_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     101.408us       299.53%     101.408us     101.408us             1  
-                                       torch_layer_norm         3.59%      69.623us        99.76%       1.933ms       1.933ms       0.000us         0.00%      44.608us      44.608us             1  
-                                       aten::layer_norm         0.46%       8.960us        96.16%       1.864ms     621.253us       0.000us         0.00%      44.608us      14.869us             3  
-                                aten::native_layer_norm         2.58%      49.912us        95.70%       1.855ms     618.266us      33.856us       100.00%      44.608us      14.869us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      33.856us       100.00%      33.856us      11.285us             3  
-                                Activity Buffer Request        80.06%       1.552ms        80.06%       1.552ms       1.552ms      10.752us        31.76%      10.752us      10.752us             1  
-                                            aten::empty         1.48%      28.770us         1.48%      28.770us       3.197us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        11.38%     220.624us        11.38%     220.624us      73.541us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.20%       3.930us         0.20%       3.930us       0.655us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.730us         0.24%       4.730us       4.730us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      95.615us       580.22%      95.615us      95.615us             1  
+                                       torch_layer_norm         3.86%      68.250us        99.72%       1.762ms       1.762ms       0.000us         0.00%      21.951us      21.951us             1  
+                                       aten::layer_norm         0.50%       8.771us        95.86%       1.694ms     564.703us       0.000us         0.00%      21.951us       7.317us             3  
+                                aten::native_layer_norm         3.18%      56.263us        95.36%       1.685ms     561.780us      16.479us       100.00%      21.951us       7.317us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      16.479us       100.00%      16.479us       5.493us             3  
+                                Activity Buffer Request        81.70%       1.444ms        81.70%       1.444ms       1.444ms       5.472us        33.21%       5.472us       5.472us             1  
+                                            aten::empty         1.62%      28.639us         1.62%      28.639us       3.182us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.61%     152.252us         8.61%     152.252us      50.751us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.24%       4.230us         0.24%       4.230us       0.705us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       4.980us         0.28%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.938ms
-Self CUDA time total: 33.856us
+Self CPU time total: 1.767ms
+Self CUDA time total: 16.479us
 
 
 
@@ -4199,19 +4187,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S1024_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm        11.17%      70.062us        99.23%     622.373us     622.373us       0.000us         0.00%     136.799us     136.799us             1  
-                                       aten::layer_norm         1.42%       8.898us        88.06%     552.311us     184.104us       0.000us         0.00%     136.799us      45.600us             3  
-                                aten::native_layer_norm         7.72%      48.411us        86.64%     543.413us     181.138us      86.463us       100.00%     136.799us      45.600us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     104.799us       121.21%     104.799us     104.799us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      86.463us       100.00%      86.463us      28.821us             3  
-                                Activity Buffer Request        40.82%     256.046us        40.82%     256.046us     256.046us      50.336us        58.22%      50.336us      50.336us             1  
-                                            aten::empty         4.50%      28.250us         4.50%      28.250us       3.139us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        32.98%     206.875us        32.98%     206.875us      68.958us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.61%       3.831us         0.61%       3.831us       0.638us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.77%       4.841us         0.77%       4.841us       4.841us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      88.894us       345.94%      88.894us      88.894us             1  
+                                       torch_layer_norm        15.31%      64.511us        98.72%     416.027us     416.027us       0.000us         0.00%      34.240us      34.240us             1  
+                                       aten::layer_norm         2.02%       8.530us        83.41%     351.516us     117.172us       0.000us         0.00%      34.240us      11.413us             3  
+                                aten::native_layer_norm        12.31%      51.881us        81.39%     342.986us     114.329us      25.696us       100.00%      34.240us      11.413us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      25.696us       100.00%      25.696us       8.565us             3  
+                                Activity Buffer Request        25.35%     106.822us        25.35%     106.822us     106.822us       8.544us        33.25%       8.544us       8.544us             1  
+                                            aten::empty         6.69%      28.191us         6.69%      28.191us       3.132us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        36.17%     152.423us        36.17%     152.423us      50.808us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.87%       3.669us         0.87%       3.669us       0.612us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.28%       5.400us         1.28%       5.400us       5.400us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 627.214us
-Self CUDA time total: 86.463us
+Self CPU time total: 421.427us
+Self CUDA time total: 25.696us
 
 
 
@@ -4221,19 +4209,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S1024_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm        10.59%      67.421us        90.93%     578.903us     578.903us       0.000us         0.00%     292.824us     292.824us             1  
-                                       aten::layer_norm         1.32%       8.381us        80.34%     511.482us     170.494us       0.000us         0.00%     292.824us      97.608us             3  
-                                aten::native_layer_norm         7.29%      46.413us        79.02%     503.101us     167.700us     181.403us       100.00%     292.824us      97.608us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     182.844us       100.79%     182.844us     182.844us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     181.403us       100.00%     181.403us      60.468us             3  
-                                Activity Buffer Request        34.89%     222.115us        34.89%     222.115us     222.115us     111.421us        61.42%     111.421us     111.421us             1  
-                                            aten::empty         4.31%      27.430us         4.31%      27.430us       3.048us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        31.85%     202.744us        31.85%     202.744us      67.581us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.69%       4.399us         0.69%       4.399us       0.733us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         9.07%      57.751us         9.07%      57.751us      57.751us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.99%      70.451us        99.68%       1.760ms       1.760ms       0.000us         0.00%     110.273us     110.273us             1  
+                                       aten::layer_norm         0.54%       9.469us        95.69%       1.690ms     563.186us       0.000us         0.00%     110.273us      36.758us             3  
+                                aten::native_layer_norm         2.91%      51.321us        95.15%       1.680ms     560.030us      70.464us       100.00%     110.273us      36.758us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     104.384us       148.14%     104.384us     104.384us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      70.464us       100.00%      70.464us      23.488us             3  
+                                Activity Buffer Request        81.54%       1.440ms        81.54%       1.440ms       1.440ms      39.809us        56.50%      39.809us      39.809us             1  
+                                            aten::empty         1.69%      29.812us         1.69%      29.812us       3.312us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.79%     155.141us         8.79%     155.141us      51.714us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.23%       4.141us         0.23%       4.141us       0.690us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.32%       5.631us         0.32%       5.631us       5.631us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 636.654us
-Self CUDA time total: 181.403us
+Self CPU time total: 1.766ms
+Self CUDA time total: 70.464us
 
 
 
@@ -4243,19 +4231,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S2048_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      94.270us       223.52%      94.270us      94.270us             1  
-                                       torch_layer_norm        10.87%      65.642us        99.23%     599.223us     599.223us       0.000us         0.00%      55.232us      55.232us             1  
-                                       aten::layer_norm         1.37%       8.270us        88.36%     533.581us     177.860us       0.000us         0.00%      55.232us      18.411us             3  
-                                aten::native_layer_norm         8.01%      48.352us        86.99%     525.311us     175.104us      42.176us       100.00%      55.232us      18.411us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      42.176us       100.00%      42.176us      14.059us             3  
-                                Activity Buffer Request        40.23%     242.915us        40.23%     242.915us     242.915us      13.056us        30.96%      13.056us      13.056us             1  
-                                            aten::empty         4.42%      26.710us         4.42%      26.710us       2.968us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        33.66%     203.264us        33.66%     203.264us      67.755us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.67%       4.070us         0.67%       4.070us       0.678us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.77%       4.660us         0.77%       4.660us       4.660us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      94.879us       526.67%      94.879us      94.879us             1  
+                                       torch_layer_norm         3.90%      69.211us        99.68%       1.768ms       1.768ms       0.000us         0.00%      23.935us      23.935us             1  
+                                       aten::layer_norm         0.53%       9.340us        95.78%       1.699ms     566.293us       0.000us         0.00%      23.935us       7.978us             3  
+                                aten::native_layer_norm         2.96%      52.430us        95.26%       1.690ms     563.180us      18.015us       100.00%      23.935us       7.978us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      18.015us       100.00%      18.015us       6.005us             3  
+                                Activity Buffer Request        81.67%       1.449ms        81.67%       1.449ms       1.449ms       5.920us        32.86%       5.920us       5.920us             1  
+                                            aten::empty         1.69%      29.991us         1.69%      29.991us       3.332us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.72%     154.594us         8.72%     154.594us      51.531us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.22%       3.890us         0.22%       3.890us       0.648us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.32%       5.590us         0.32%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 603.883us
-Self CUDA time total: 42.176us
+Self CPU time total: 1.774ms
+Self CUDA time total: 18.015us
 
 
 
@@ -4265,19 +4253,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S2048_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm        11.67%      65.482us        99.14%     556.242us     556.242us       0.000us         0.00%     139.454us     139.454us             1  
-                                       aten::layer_norm         1.45%       8.119us        87.47%     490.760us     163.587us       0.000us         0.00%     139.454us      46.485us             3  
-                                aten::native_layer_norm         8.23%      46.172us        86.02%     482.641us     160.880us      89.983us       100.00%     139.454us      46.485us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      99.327us       110.38%      99.327us      99.327us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      89.983us       100.00%      89.983us      29.994us             3  
-                                Activity Buffer Request        36.14%     202.785us        36.14%     202.785us     202.785us      49.471us        54.98%      49.471us      49.471us             1  
-                                            aten::empty         4.95%      27.770us         4.95%      27.770us       3.086us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        35.90%     201.414us        35.90%     201.414us      67.138us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.80%       4.500us         0.80%       4.500us       0.750us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.86%       4.841us         0.86%       4.841us       4.841us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      92.671us       343.53%      92.671us      92.671us             1  
+                                       torch_layer_norm        14.22%      66.652us        98.98%     463.858us     463.858us       0.000us         0.00%      35.872us      35.872us             1  
+                                       aten::layer_norm         1.92%       9.009us        84.76%     397.206us     132.402us       0.000us         0.00%      35.872us      11.957us             3  
+                                aten::native_layer_norm        11.29%      52.919us        82.83%     388.197us     129.399us      26.976us       100.00%      35.872us      11.957us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      26.976us       100.00%      26.976us       8.992us             3  
+                                Activity Buffer Request        32.20%     150.883us        32.20%     150.883us     150.883us       8.896us        32.98%       8.896us       8.896us             1  
+                                            aten::empty         6.01%      28.182us         6.01%      28.182us       3.131us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        32.49%     152.273us        32.49%     152.273us      50.758us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.84%       3.940us         0.84%       3.940us       0.657us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.02%       4.791us         1.02%       4.791us       4.791us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 561.083us
-Self CUDA time total: 89.983us
+Self CPU time total: 468.649us
+Self CUDA time total: 26.976us
 
 
 
@@ -4287,19 +4275,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S2048_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.41%      68.312us        98.25%       1.970ms       1.970ms       0.000us         0.00%     270.744us     270.744us             1  
-                                       aten::layer_norm         0.47%       9.381us        94.84%       1.901ms     633.747us       0.000us         0.00%     270.744us      90.248us             3  
-                                aten::native_layer_norm         2.60%      52.050us        94.37%       1.892ms     630.620us     169.179us       100.00%     270.744us      90.248us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     170.619us       100.85%     170.619us     170.619us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     169.179us       100.00%     169.179us      56.393us             3  
-                                Activity Buffer Request        80.19%       1.608ms        80.19%       1.608ms       1.608ms     101.565us        60.03%     101.565us     101.565us             1  
-                                            aten::empty         1.37%      27.561us         1.37%      27.561us       3.062us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         9.98%     199.994us         9.98%     199.994us      66.665us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.23%       4.700us         0.23%       4.700us       0.783us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         1.75%      35.140us         1.75%      35.140us      35.140us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     133.341us       184.87%     133.341us     133.341us             1  
+                                       torch_layer_norm         3.93%      69.900us        99.72%       1.772ms       1.772ms       0.000us         0.00%     112.892us     112.892us             1  
+                                       aten::layer_norm         0.55%       9.790us        95.79%       1.702ms     567.350us       0.000us         0.00%     112.892us      37.631us             3  
+                                aten::native_layer_norm         3.28%      58.200us        95.24%       1.692ms     564.087us      72.125us       100.00%     112.892us      37.631us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      72.125us       100.00%      72.125us      24.042us             3  
+                                Activity Buffer Request        80.05%       1.422ms        80.05%       1.422ms       1.422ms      40.767us        56.52%      40.767us      40.767us             1  
+                                            aten::empty         1.64%      29.113us         1.64%      29.113us       3.235us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        10.01%     177.823us        10.01%     177.823us      59.274us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.27%       4.770us         0.27%       4.770us       0.795us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       4.900us         0.28%       4.900us       4.900us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.005ms
-Self CUDA time total: 169.179us
+Self CPU time total: 1.777ms
+Self CUDA time total: 72.125us
 
 
 
@@ -4309,19 +4297,19 @@ PROFILE TRACE: torch_layer_norm | LN_B1_S2048_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         5.72%      67.681us        46.83%     553.822us     553.822us       0.000us         0.00%       1.004ms       1.004ms             1  
-                                       aten::layer_norm         0.73%       8.590us        41.11%     486.141us     162.047us       0.000us         0.00%       1.004ms     334.666us             3  
-                                aten::native_layer_norm         3.92%      46.321us        40.38%     477.551us     159.184us     752.710us       100.00%       1.004ms     334.666us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     754.214us       100.20%     754.214us     754.214us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     752.710us       100.00%     752.710us     250.903us             3  
-                                Activity Buffer Request        16.98%     200.864us        16.98%     200.864us     200.864us     251.287us        33.38%     251.287us     251.287us             1  
-                                            aten::empty         2.27%      26.822us         2.27%      26.822us       2.980us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        16.81%     198.794us        16.81%     198.794us      66.265us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.40%       4.750us         0.40%       4.750us       0.792us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        53.17%     628.854us        53.17%     628.854us     628.854us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm        14.68%      65.741us        95.47%     427.658us     427.658us       0.000us         0.00%     230.621us     230.621us             1  
+                                       aten::layer_norm         2.04%       9.121us        80.79%     361.917us     120.639us       0.000us         0.00%     230.621us      76.874us             3  
+                                aten::native_layer_norm        11.17%      50.059us        78.75%     352.796us     117.599us     144.510us       100.00%     230.621us      76.874us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     146.014us       101.04%     146.014us     146.014us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     144.510us       100.00%     144.510us      48.170us             3  
+                                Activity Buffer Request        26.04%     116.642us        26.04%     116.642us     116.642us      86.111us        59.59%      86.111us      86.111us             1  
+                                            aten::empty         6.43%      28.811us         6.43%      28.811us       3.201us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        34.20%     153.184us        34.20%     153.184us      51.061us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.92%       4.100us         0.92%       4.100us       0.683us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         4.53%      20.311us         4.53%      20.311us      20.311us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.183ms
-Self CUDA time total: 752.710us
+Self CPU time total: 447.969us
+Self CUDA time total: 144.510us
 
 
 
@@ -4331,19 +4319,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      98.270us       717.51%      98.270us      98.270us             1  
-                                       torch_layer_norm         4.06%      75.621us        99.75%       1.859ms       1.859ms       0.000us         0.00%      18.144us      18.144us             1  
-                                       aten::layer_norm         0.48%       8.869us        95.70%       1.783ms     594.400us       0.000us         0.00%      18.144us       6.048us             3  
-                                aten::native_layer_norm         2.66%      49.555us        95.22%       1.774ms     591.443us      13.696us       100.00%      18.144us       6.048us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      13.696us       100.00%      13.696us       4.565us             3  
-                                Activity Buffer Request        80.42%       1.498ms        80.42%       1.498ms       1.498ms       4.448us        32.48%       4.448us       4.448us             1  
-                                            aten::empty         1.52%      28.408us         1.52%      28.408us       3.156us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        10.37%     193.204us        10.37%     193.204us      64.401us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.25%       4.670us         0.25%       4.670us       0.778us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       4.580us         0.25%       4.580us       4.580us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      92.096us       943.61%      92.096us      92.096us             1  
+                                       torch_layer_norm         3.85%      68.512us        99.73%       1.773ms       1.773ms       0.000us         0.00%      12.864us      12.864us             1  
+                                       aten::layer_norm         0.55%       9.759us        95.87%       1.705ms     568.216us       0.000us         0.00%      12.864us       4.288us             3  
+                                aten::native_layer_norm         3.00%      53.309us        95.32%       1.695ms     564.963us       9.760us       100.00%      12.864us       4.288us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       9.760us       100.00%       9.760us       3.253us             3  
+                                Activity Buffer Request        81.26%       1.445ms        81.26%       1.445ms       1.445ms       3.104us        31.80%       3.104us       3.104us             1  
+                                            aten::empty         1.70%      30.172us         1.70%      30.172us       3.352us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         9.14%     162.452us         9.14%     162.452us      54.151us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.24%       4.201us         0.24%       4.201us       0.700us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       4.880us         0.27%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.863ms
-Self CUDA time total: 13.696us
+Self CPU time total: 1.778ms
+Self CUDA time total: 9.760us
 
 
 
@@ -4353,19 +4341,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      84.157us       423.54%      84.157us      84.157us             1  
-                                       torch_layer_norm        12.56%      65.601us        99.10%     517.451us     517.451us       0.000us         0.00%      26.238us      26.238us             1  
-                                       aten::layer_norm         1.65%       8.620us        86.53%     451.850us     150.617us       0.000us         0.00%      26.238us       8.746us             3  
-                                aten::native_layer_norm         8.95%      46.731us        84.88%     443.230us     147.743us      19.870us       100.00%      26.238us       8.746us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      19.870us       100.00%      19.870us       6.623us             3  
-                                Activity Buffer Request        33.96%     177.304us        33.96%     177.304us     177.304us       6.368us        32.05%       6.368us       6.368us             1  
-                                            aten::empty         5.03%      26.250us         5.03%      26.250us       2.917us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        36.12%     188.585us        36.12%     188.585us      62.862us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.83%       4.360us         0.83%       4.360us       0.727us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.90%       4.720us         0.90%       4.720us       4.720us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      91.521us       709.63%      91.521us      91.521us             1  
+                                       torch_layer_norm         4.32%      76.641us        99.71%       1.771ms       1.771ms       0.000us         0.00%      17.186us      17.186us             1  
+                                       aten::layer_norm         0.52%       9.251us        95.40%       1.694ms     564.620us       0.000us         0.00%      17.186us       5.729us             3  
+                                aten::native_layer_norm         2.94%      52.208us        94.87%       1.685ms     561.536us      12.897us       100.00%      17.186us       5.729us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      12.897us       100.00%      12.897us       4.299us             3  
+                                Activity Buffer Request        81.35%       1.444ms        81.35%       1.444ms       1.444ms       4.289us        33.26%       4.289us       4.289us             1  
+                                            aten::empty         1.65%      29.223us         1.65%      29.223us       3.247us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.72%     154.793us         8.72%     154.793us      51.598us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.22%       3.890us         0.22%       3.890us       0.648us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.29%       5.110us         0.29%       5.110us       5.110us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 522.171us
-Self CUDA time total: 19.870us
+Self CPU time total: 1.776ms
+Self CUDA time total: 12.897us
 
 
 
@@ -4375,19 +4363,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S128_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     100.830us       309.83%     100.830us     100.830us             1  
-                                       torch_layer_norm         3.66%      68.162us        99.75%       1.858ms       1.858ms       0.000us         0.00%      42.752us      42.752us             1  
-                                       aten::layer_norm         0.47%       8.830us        96.09%       1.790ms     596.629us       0.000us         0.00%      42.752us      14.251us             3  
-                                aten::native_layer_norm         2.68%      49.840us        95.61%       1.781ms     593.686us      32.544us       100.00%      42.752us      14.251us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      32.544us       100.00%      32.544us      10.848us             3  
-                                Activity Buffer Request        81.03%       1.509ms        81.03%       1.509ms       1.509ms      10.208us        31.37%      10.208us      10.208us             1  
-                                            aten::empty         1.49%      27.810us         1.49%      27.810us       3.090us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        10.20%     189.944us        10.20%     189.944us      63.315us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.21%       3.981us         0.21%       3.981us       0.663us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       4.711us         0.25%       4.711us       4.711us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      88.130us       448.50%      88.130us      88.130us             1  
+                                       torch_layer_norm        11.06%      64.130us        99.16%     575.190us     575.190us       0.000us         0.00%      26.147us      26.147us             1  
+                                       aten::layer_norm         1.59%       9.222us        88.10%     511.060us     170.353us       0.000us         0.00%      26.147us       8.716us             3  
+                                aten::native_layer_norm         8.61%      49.940us        86.51%     501.838us     167.279us      19.650us       100.00%      26.147us       8.716us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      19.650us       100.00%      19.650us       6.550us             3  
+                                Activity Buffer Request        45.46%     263.724us        45.46%     263.724us     263.724us       6.497us        33.06%       6.497us       6.497us             1  
+                                            aten::empty         4.97%      28.852us         4.97%      28.852us       3.206us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        26.69%     154.833us        26.69%     154.833us      51.611us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.77%       4.489us         0.77%       4.489us       0.748us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.84%       4.880us         0.84%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.863ms
-Self CUDA time total: 32.544us
+Self CPU time total: 580.070us
+Self CUDA time total: 19.650us
 
 
 
@@ -4397,19 +4385,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S128_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.74%      69.652us        99.74%       1.860ms       1.860ms       0.000us         0.00%     141.437us     141.437us             1  
-                                       aten::layer_norm         0.49%       9.081us        96.01%       1.790ms     596.709us       0.000us         0.00%     141.437us      47.146us             3  
-                                aten::native_layer_norm         2.73%      50.892us        95.52%       1.781ms     593.682us      88.286us       100.00%     141.437us      47.146us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     105.086us       119.03%     105.086us     105.086us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      88.286us       100.00%      88.286us      29.429us             3  
-                                Activity Buffer Request        81.06%       1.511ms        81.06%       1.511ms       1.511ms      53.151us        60.20%      53.151us      53.151us             1  
-                                            aten::empty         1.52%      28.430us         1.52%      28.430us       3.159us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        10.00%     186.444us        10.00%     186.444us      62.148us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.21%       3.839us         0.21%       3.839us       0.640us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       4.780us         0.26%       4.780us       4.780us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      92.576us       290.74%      92.576us      92.576us             1  
+                                       torch_layer_norm        10.78%      63.911us        99.14%     587.520us     587.520us       0.000us         0.00%      42.562us      42.562us             1  
+                                       aten::layer_norm         1.44%       8.510us        88.35%     523.609us     174.536us       0.000us         0.00%      42.562us      14.187us             3  
+                                aten::native_layer_norm         8.62%      51.095us        86.92%     515.099us     171.700us      31.841us       100.00%      42.562us      14.187us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      31.841us       100.00%      31.841us      10.614us             3  
+                                Activity Buffer Request        46.87%     277.744us        46.87%     277.744us     277.744us      10.721us        33.67%      10.721us      10.721us             1  
+                                            aten::empty         4.75%      28.169us         4.75%      28.169us       3.130us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        25.92%     153.632us        25.92%     153.632us      51.211us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.75%       4.459us         0.75%       4.459us       0.743us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.86%       5.110us         0.86%       5.110us       5.110us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.865ms
-Self CUDA time total: 88.286us
+Self CPU time total: 592.630us
+Self CUDA time total: 31.841us
 
 
 
@@ -4419,19 +4407,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      93.917us       224.90%      93.917us      93.917us             1  
-                                       torch_layer_norm        11.99%      66.702us        99.09%     551.052us     551.052us       0.000us         0.00%      54.848us      54.848us             1  
-                                       aten::layer_norm         1.50%       8.369us        87.10%     484.350us     161.450us       0.000us         0.00%      54.848us      18.283us             3  
-                                aten::native_layer_norm         8.25%      45.863us        85.59%     475.981us     158.660us      41.760us       100.00%      54.848us      18.283us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      41.760us       100.00%      41.760us      13.920us             3  
-                                Activity Buffer Request        38.06%     211.665us        38.06%     211.665us     211.665us      13.088us        31.34%      13.088us      13.088us             1  
-                                            aten::empty         5.01%      27.870us         5.01%      27.870us       3.097us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        33.56%     186.643us        33.56%     186.643us      62.214us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.71%       3.940us         0.71%       3.940us       0.657us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.91%       5.050us         0.91%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      95.776us       539.28%      95.776us      95.776us             1  
+                                       torch_layer_norm        13.84%     112.583us        99.26%     807.595us     807.595us       0.000us         0.00%      23.680us      23.680us             1  
+                                       aten::layer_norm         1.40%      11.400us        85.42%     695.012us     231.671us       0.000us         0.00%      23.680us       7.893us             3  
+                                aten::native_layer_norm         7.57%      61.601us        84.02%     683.612us     227.871us      17.760us       100.00%      23.680us       7.893us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      17.760us       100.00%      17.760us       5.920us             3  
+                                Activity Buffer Request        33.76%     274.664us        33.76%     274.664us     274.664us       5.920us        33.33%       5.920us       5.920us             1  
+                                            aten::empty         3.69%      30.062us         3.69%      30.062us       3.340us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        38.34%     311.955us        38.34%     311.955us     103.985us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.66%       5.330us         0.66%       5.330us       0.888us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.74%       6.030us         0.74%       6.030us       6.030us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 556.102us
-Self CUDA time total: 41.760us
+Self CPU time total: 813.625us
+Self CUDA time total: 17.760us
 
 
 
@@ -4441,19 +4429,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         4.62%      86.531us        99.75%       1.867ms       1.867ms       0.000us         0.00%     136.638us     136.638us             1  
-                                       aten::layer_norm         0.50%       9.359us        95.12%       1.780ms     593.443us       0.000us         0.00%     136.638us      45.546us             3  
-                                aten::native_layer_norm         2.82%      52.769us        94.62%       1.771ms     590.323us      88.543us       100.00%     136.638us      45.546us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     111.902us       126.38%     111.902us     111.902us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      88.543us       100.00%      88.543us      29.514us             3  
-                                Activity Buffer Request        80.23%       1.502ms        80.23%       1.502ms       1.502ms      48.095us        54.32%      48.095us      48.095us             1  
-                                            aten::empty         1.54%      28.823us         1.54%      28.823us       3.203us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         9.82%     183.715us         9.82%     183.715us      61.238us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.21%       4.020us         0.21%       4.020us       0.670us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       4.770us         0.25%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      96.383us       353.93%      96.383us      96.383us             1  
+                                       torch_layer_norm         4.14%      80.990us        99.72%       1.949ms       1.949ms       0.000us         0.00%      36.288us      36.288us             1  
+                                       aten::layer_norm         0.49%       9.631us        95.58%       1.868ms     622.648us       0.000us         0.00%      36.288us      12.096us             3  
+                                aten::native_layer_norm         2.77%      54.113us        95.09%       1.858ms     619.438us      27.232us       100.00%      36.288us      12.096us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      27.232us       100.00%      27.232us       9.077us             3  
+                                Activity Buffer Request        75.84%       1.482ms        75.84%       1.482ms       1.482ms       9.056us        33.25%       9.056us       9.056us             1  
+                                            aten::empty         1.50%      29.320us         1.50%      29.320us       3.258us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        14.76%     288.535us        14.76%     288.535us      96.178us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.22%       4.249us         0.22%       4.249us       0.708us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       5.411us         0.28%       5.411us       5.411us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.872ms
-Self CUDA time total: 88.543us
+Self CPU time total: 1.954ms
+Self CUDA time total: 27.232us
 
 
 
@@ -4463,19 +4451,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S512_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm        11.20%      64.902us        91.70%     531.442us     531.442us       0.000us         0.00%     274.259us     274.259us             1  
-                                       aten::layer_norm         1.46%       8.459us        80.50%     466.540us     155.513us       0.000us         0.00%     274.259us      91.420us             3  
-                                aten::native_layer_norm         8.11%      47.030us        79.04%     458.081us     152.694us     170.744us       100.00%     274.259us      91.420us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     172.183us       100.84%     172.183us     172.183us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     170.744us       100.00%     170.744us      56.915us             3  
-                                Activity Buffer Request        33.75%     195.605us        33.75%     195.605us     195.605us     103.515us        60.63%     103.515us     103.515us             1  
-                                            aten::empty         4.92%      28.491us         4.92%      28.491us       3.166us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        31.58%     183.015us        31.58%     183.015us      61.005us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.68%       3.940us         0.68%       3.940us       0.657us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         8.30%      48.121us         8.30%      48.121us      48.121us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.80%      69.480us        99.73%       1.822ms       1.822ms       0.000us         0.00%     112.641us     112.641us             1  
+                                       aten::layer_norm         0.50%       9.151us        95.93%       1.752ms     584.111us       0.000us         0.00%     112.641us      37.547us             3  
+                                aten::native_layer_norm         2.81%      51.420us        95.43%       1.743ms     581.060us      72.033us       100.00%     112.641us      37.547us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     101.696us       141.18%     101.696us     101.696us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      72.033us       100.00%      72.033us      24.011us             3  
+                                Activity Buffer Request        80.53%       1.471ms        80.53%       1.471ms       1.471ms      40.608us        56.37%      40.608us      40.608us             1  
+                                            aten::empty         1.60%      29.163us         1.60%      29.163us       3.240us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        10.27%     187.683us        10.27%     187.683us      62.561us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.22%       3.950us         0.22%       3.950us       0.658us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       4.880us         0.27%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 579.563us
-Self CUDA time total: 170.744us
+Self CPU time total: 1.827ms
+Self CUDA time total: 72.033us
 
 
 
@@ -4485,19 +4473,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S512_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         2.78%      68.521us        74.25%       1.833ms       1.833ms       0.000us         0.00%       1.015ms       1.015ms             1  
-                                       aten::layer_norm         0.37%       9.021us        71.48%       1.765ms     588.209us       0.000us         0.00%       1.015ms     338.437us             3  
-                                aten::native_layer_norm         1.97%      48.600us        71.11%       1.756ms     585.202us     765.011us       100.00%       1.015ms     338.437us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     766.355us       100.18%     766.355us     766.355us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     765.011us       100.00%     765.011us     255.004us             3  
-                                Activity Buffer Request        60.48%       1.493ms        60.48%       1.493ms       1.493ms     250.300us        32.72%     250.300us     250.300us             1  
-                                            aten::empty         1.12%      27.530us         1.12%      27.530us       3.059us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         7.39%     182.375us         7.39%     182.375us      60.792us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.16%       4.040us         0.16%       4.040us       0.673us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        25.75%     635.633us        25.75%     635.633us     635.633us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.85%      68.680us        99.71%       1.780ms       1.780ms       0.000us         0.00%     229.955us     229.955us             1  
+                                       aten::layer_norm         0.61%      10.850us        95.86%       1.711ms     570.370us       0.000us         0.00%     229.955us      76.652us             3  
+                                aten::native_layer_norm         3.11%      55.560us        95.26%       1.700ms     566.754us     144.066us       100.00%     229.955us      76.652us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     145.569us       101.04%     145.569us     145.569us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     144.066us       100.00%     144.066us      48.022us             3  
+                                Activity Buffer Request        79.52%       1.419ms        79.52%       1.419ms       1.419ms      85.889us        59.62%      85.889us      85.889us             1  
+                                            aten::empty         1.71%      30.551us         1.71%      30.551us       3.395us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        10.67%     190.375us        10.67%     190.375us      63.458us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.24%       4.330us         0.24%       4.330us       0.722us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.29%       5.130us         0.29%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.469ms
-Self CUDA time total: 765.011us
+Self CPU time total: 1.785ms
+Self CUDA time total: 144.066us
 
 
 
@@ -4507,19 +4495,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S1024_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.81%      69.942us        99.74%       1.831ms       1.831ms       0.000us         0.00%     147.579us     147.579us             1  
-                                       aten::layer_norm         0.48%       8.750us        95.93%       1.761ms     586.892us       0.000us         0.00%     147.579us      49.193us             3  
-                                aten::native_layer_norm         2.73%      50.120us        95.45%       1.752ms     583.976us      98.333us       100.00%     147.579us      49.193us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     109.597us       111.45%     109.597us     109.597us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      98.333us       100.00%      98.333us      32.778us             3  
-                                Activity Buffer Request        81.29%       1.492ms        81.29%       1.492ms       1.492ms      49.246us        50.08%      49.246us      49.246us             1  
-                                            aten::empty         1.50%      27.580us         1.50%      27.580us       3.064us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         9.72%     178.483us         9.72%     178.483us      59.494us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.21%       3.842us         0.21%       3.842us       0.640us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       4.750us         0.26%       4.750us       4.750us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     115.904us       398.90%     115.904us     115.904us             1  
+                                       torch_layer_norm         4.36%      77.971us        99.69%       1.781ms       1.781ms       0.000us         0.00%      38.656us      38.656us             1  
+                                       aten::layer_norm         0.59%      10.570us        95.33%       1.703ms     567.730us       0.000us         0.00%      38.656us      12.885us             3  
+                                aten::native_layer_norm         3.31%      59.081us        94.74%       1.693ms     564.207us      29.056us       100.00%      38.656us      12.885us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      29.056us       100.00%      29.056us       9.685us             3  
+                                Activity Buffer Request        80.03%       1.430ms        80.03%       1.430ms       1.430ms       9.600us        33.04%       9.600us       9.600us             1  
+                                            aten::empty         1.84%      32.962us         1.84%      32.962us       3.662us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         9.29%     165.972us         9.29%     165.972us      55.324us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.27%       4.790us         0.27%       4.790us       0.798us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.31%       5.470us         0.31%       5.470us       5.470us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.835ms
-Self CUDA time total: 98.333us
+Self CPU time total: 1.787ms
+Self CUDA time total: 29.056us
 
 
 
@@ -4529,19 +4517,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S1024_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm        13.41%      85.581us        92.66%     591.443us     591.443us       0.000us         0.00%     270.843us     270.843us             1  
-                                       aten::layer_norm         1.36%       8.710us        79.25%     505.862us     168.621us       0.000us         0.00%     270.843us      90.281us             3  
-                                aten::native_layer_norm         7.27%      46.392us        77.88%     497.152us     165.717us     172.445us       100.00%     270.843us      90.281us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     173.885us       100.84%     173.885us     173.885us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     172.445us       100.00%     172.445us      57.482us             3  
-                                Activity Buffer Request        37.57%     239.825us        37.57%     239.825us     239.825us      98.398us        57.06%      98.398us      98.398us             1  
-                                            aten::empty         4.27%      27.241us         4.27%      27.241us       3.027us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        28.17%     179.824us        28.17%     179.824us      59.941us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.61%       3.870us         0.61%       3.870us       0.645us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         7.34%      46.881us         7.34%      46.881us      46.881us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm        14.07%      64.760us        98.95%     455.588us     455.588us       0.000us         0.00%     101.120us     101.120us             1  
+                                       aten::layer_norm         1.91%       8.791us        84.88%     390.828us     130.276us       0.000us         0.00%     101.120us      33.707us             3  
+                                aten::native_layer_norm        11.79%      54.281us        82.97%     382.037us     127.346us      65.344us       100.00%     101.120us      33.707us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      96.510us       147.70%      96.510us      96.510us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      65.344us       100.00%      65.344us      21.781us             3  
+                                Activity Buffer Request        29.77%     137.072us        29.77%     137.072us     137.072us      35.776us        54.75%      35.776us      35.776us             1  
+                                            aten::empty         6.60%      30.402us         6.60%      30.402us       3.378us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        33.93%     156.232us        33.93%     156.232us      52.077us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.88%       4.050us         0.88%       4.050us       0.675us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.05%       4.840us         1.05%       4.840us       4.840us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 638.324us
-Self CUDA time total: 172.445us
+Self CPU time total: 460.428us
+Self CUDA time total: 65.344us
 
 
 
@@ -4551,19 +4539,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S1024_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         2.80%      69.051us        74.17%       1.829ms       1.829ms       0.000us         0.00%       1.017ms       1.017ms             1  
-                                       aten::layer_norm         0.38%       9.271us        71.37%       1.760ms     586.799us       0.000us         0.00%       1.017ms     338.980us             3  
-                                aten::native_layer_norm         1.96%      48.329us        70.99%       1.751ms     583.709us     774.001us       100.00%       1.017ms     338.980us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     775.313us       100.17%     775.313us     775.313us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     774.001us       100.00%     774.001us     258.000us             3  
-                                Activity Buffer Request        60.50%       1.492ms        60.50%       1.492ms       1.492ms     242.939us        31.39%     242.939us     242.939us             1  
-                                            aten::empty         1.12%      27.712us         1.12%      27.712us       3.079us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         7.26%     179.014us         7.26%     179.014us      59.671us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.16%       3.900us         0.16%       3.900us       0.650us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        25.83%     637.134us        25.83%     637.134us     637.134us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.83%      67.811us        99.72%       1.767ms       1.767ms       0.000us         0.00%     207.840us     207.840us             1  
+                                       aten::layer_norm         0.55%       9.819us        95.89%       1.699ms     566.320us       0.000us         0.00%     207.840us      69.280us             3  
+                                aten::native_layer_norm         3.03%      53.603us        95.34%       1.689ms     563.047us     129.312us       100.00%     207.840us      69.280us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     130.911us       101.24%     130.911us     130.911us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     129.312us       100.00%     129.312us      43.104us             3  
+                                Activity Buffer Request        81.49%       1.444ms        81.49%       1.444ms       1.444ms      78.528us        60.73%      78.528us      78.528us             1  
+                                            aten::empty         1.74%      30.830us         1.74%      30.830us       3.426us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.86%     156.973us         8.86%     156.973us      52.324us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.23%       4.020us         0.23%       4.020us       0.670us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       4.980us         0.28%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.467ms
-Self CUDA time total: 774.001us
+Self CPU time total: 1.772ms
+Self CUDA time total: 129.312us
 
 
 
@@ -4573,19 +4561,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S1024_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.34%      67.143us        27.08%     544.452us     544.452us       0.000us         0.00%       2.061ms       2.061ms             1  
-                                       aten::layer_norm         0.43%       8.689us        23.74%     477.309us     159.103us       0.000us         0.00%       2.061ms     687.112us             3  
-                                aten::native_layer_norm         2.32%      46.570us        23.31%     468.620us     156.207us       1.591ms       100.00%       2.061ms     687.112us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.593ms       100.09%       1.593ms       1.593ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.591ms       100.00%       1.591ms     530.454us             3  
-                                Activity Buffer Request        10.53%     211.705us        10.53%     211.705us     211.705us     469.975us        29.53%     469.975us     469.975us             1  
-                                            aten::empty         1.38%      27.780us         1.38%      27.780us       3.087us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         8.88%     178.623us         8.88%     178.623us      59.541us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.20%       3.942us         0.20%       3.942us       0.657us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        72.92%       1.466ms        72.92%       1.466ms       1.466ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.13%      68.611us        81.17%       1.779ms       1.779ms       0.000us         0.00%     737.526us     737.526us             1  
+                                       aten::layer_norm         0.41%       9.061us        78.04%       1.711ms     570.260us       0.000us         0.00%     737.526us     245.842us             3  
+                                aten::native_layer_norm         2.43%      53.328us        77.62%       1.702ms     567.240us     547.705us       100.00%     737.526us     245.842us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     549.241us       100.28%     549.241us     549.241us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     547.705us       100.00%     547.705us     182.568us             3  
+                                Activity Buffer Request        66.39%       1.455ms        66.39%       1.455ms       1.455ms     189.821us        34.66%     189.821us     189.821us             1  
+                                            aten::empty         1.36%      29.741us         1.36%      29.741us       3.305us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         7.27%     159.364us         7.27%     159.364us      53.121us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.18%       3.911us         0.18%       3.911us       0.652us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        18.83%     412.857us        18.83%     412.857us     412.857us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.011ms
-Self CUDA time total: 1.591ms
+Self CPU time total: 2.192ms
+Self CUDA time total: 547.705us
 
 
 
@@ -4595,19 +4583,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S2048_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.62%      68.292us        96.50%       1.823ms       1.823ms       0.000us         0.00%     293.305us     293.305us             1  
-                                       aten::layer_norm         0.46%       8.692us        92.88%       1.754ms     584.823us       0.000us         0.00%     293.305us      97.768us             3  
-                                aten::native_layer_norm         2.51%      47.441us        92.42%       1.746ms     581.925us     194.459us       100.00%     293.305us      97.768us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     195.932us       100.76%     195.932us     195.932us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     194.459us       100.00%     194.459us      64.820us             3  
-                                Activity Buffer Request        78.85%       1.489ms        78.85%       1.489ms       1.489ms      98.846us        50.83%      98.846us      98.846us             1  
-                                            aten::empty         1.54%      29.039us         1.54%      29.039us       3.227us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         9.30%     175.764us         9.30%     175.764us      58.588us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.21%       4.050us         0.21%       4.050us       0.675us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         3.50%      66.181us         3.50%      66.181us      66.181us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm        13.81%      64.951us        98.91%     465.198us     465.198us       0.000us         0.00%     102.813us     102.813us             1  
+                                       aten::layer_norm         2.00%       9.429us        85.10%     400.247us     133.416us       0.000us         0.00%     102.813us      34.271us             3  
+                                aten::native_layer_norm        10.88%      51.150us        83.10%     390.818us     130.273us      68.606us       100.00%     102.813us      34.271us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     100.893us       147.06%     100.893us     100.893us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      68.606us       100.00%      68.606us      22.869us             3  
+                                Activity Buffer Request        31.07%     146.142us        31.07%     146.142us     146.142us      34.207us        49.86%      34.207us      34.207us             1  
+                                            aten::empty         6.17%      29.002us         6.17%      29.002us       3.222us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        34.16%     160.644us        34.16%     160.644us      53.548us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.82%       3.880us         0.82%       3.880us       0.647us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.09%       5.121us         1.09%       5.121us       5.121us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.889ms
-Self CUDA time total: 194.459us
+Self CPU time total: 470.319us
+Self CUDA time total: 68.606us
 
 
 
@@ -4617,19 +4605,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S2048_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         2.78%      69.740us        74.10%       1.857ms       1.857ms       0.000us         0.00%       1.019ms       1.019ms             1  
-                                       aten::layer_norm         0.37%       9.390us        71.32%       1.787ms     595.616us       0.000us         0.00%       1.019ms     339.749us             3  
-                                aten::native_layer_norm         1.97%      49.270us        70.94%       1.777ms     592.486us     782.484us       100.00%       1.019ms     339.749us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     783.796us       100.17%     783.796us     783.796us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     782.484us       100.00%     782.484us     260.828us             3  
-                                Activity Buffer Request        60.61%       1.519ms        60.61%       1.519ms       1.519ms     236.764us        30.26%     236.764us     236.764us             1  
-                                            aten::empty         1.20%      30.103us         1.20%      30.103us       3.345us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         7.01%     175.614us         7.01%     175.614us      58.538us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.16%       3.909us         0.16%       3.909us       0.651us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        25.90%     648.943us        25.90%     648.943us     648.943us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.85%      67.820us        99.72%       1.755ms       1.755ms       0.000us         0.00%     204.288us     204.288us             1  
+                                       aten::layer_norm         0.52%       9.151us        95.86%       1.687ms     562.280us       0.000us         0.00%     204.288us      68.096us             3  
+                                aten::native_layer_norm         2.95%      51.910us        95.34%       1.678ms     559.230us     129.120us       100.00%     204.288us      68.096us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     130.560us       101.12%     130.560us     130.560us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     129.120us       100.00%     129.120us      43.040us             3  
+                                Activity Buffer Request        81.69%       1.437ms        81.69%       1.437ms       1.437ms      75.168us        58.22%      75.168us      75.168us             1  
+                                            aten::empty         1.73%      30.362us         1.73%      30.362us       3.374us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.76%     154.112us         8.76%     154.112us      51.371us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.22%       3.910us         0.22%       3.910us       0.652us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       4.960us         0.28%       4.960us       4.960us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.506ms
-Self CUDA time total: 782.484us
+Self CPU time total: 1.760ms
+Self CUDA time total: 129.120us
 
 
 
@@ -4639,19 +4627,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S2048_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.31%      67.692us        28.17%     576.572us     576.572us       0.000us         0.00%       2.073ms       2.073ms             1  
-                                       aten::layer_norm         0.43%       8.840us        24.86%     508.880us     169.627us       0.000us         0.00%       2.073ms     691.102us             3  
-                                aten::native_layer_norm         2.41%      49.301us        24.43%     500.040us     166.680us       1.601ms       100.00%       2.073ms     691.102us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.602ms       100.08%       1.602ms       1.602ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.601ms       100.00%       1.601ms     533.655us             3  
-                                Activity Buffer Request        10.93%     223.675us        10.93%     223.675us     223.675us     472.343us        29.50%     472.343us     472.343us             1  
-                                            aten::empty         1.43%      29.180us         1.43%      29.180us       3.242us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         9.47%     193.884us         9.47%     193.884us      64.628us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.20%       4.000us         0.20%       4.000us       0.667us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        71.83%       1.470ms        71.83%       1.470ms       1.470ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.24%      70.231us        80.97%       1.754ms       1.754ms       0.000us         0.00%     714.792us     714.792us             1  
+                                       aten::layer_norm         0.42%       9.200us        77.73%       1.684ms     561.233us       0.000us         0.00%     714.792us     238.264us             3  
+                                aten::native_layer_norm         2.38%      51.610us        77.31%       1.674ms     558.166us     542.598us       100.00%     714.792us     238.264us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     544.071us       100.27%     544.071us     544.071us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     542.598us       100.00%     542.598us     180.866us             3  
+                                Activity Buffer Request        66.26%       1.435ms        66.26%       1.435ms       1.435ms     172.194us        31.74%     172.194us     172.194us             1  
+                                            aten::empty         1.34%      28.942us         1.34%      28.942us       3.216us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         7.14%     154.623us         7.14%     154.623us      51.541us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.19%       4.030us         0.19%       4.030us       0.672us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        19.03%     412.116us        19.03%     412.116us     412.116us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.047ms
-Self CUDA time total: 1.601ms
+Self CPU time total: 2.166ms
+Self CUDA time total: 542.598us
 
 
 
@@ -4661,19 +4649,19 @@ PROFILE TRACE: torch_layer_norm | LN_B4_S2048_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         1.41%      71.321us        36.81%       1.859ms       1.859ms       0.000us         0.00%       4.346ms       4.346ms             1  
-                                       aten::layer_norm         0.18%       9.191us        35.39%       1.788ms     595.990us       0.000us         0.00%       4.346ms       1.449ms             3  
-                                aten::native_layer_norm         0.98%      49.420us        35.21%       1.779ms     592.926us       3.326ms       100.00%       4.346ms       1.449ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       3.327ms       100.04%       3.327ms       3.327ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       3.326ms       100.00%       3.326ms       1.109ms             3  
-                                Activity Buffer Request        30.03%       1.517ms        30.03%       1.517ms       1.517ms       1.021ms        30.69%       1.021ms       1.021ms             1  
-                                            aten::empty         0.58%      29.141us         0.58%      29.141us       3.238us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         3.55%     179.254us         3.55%     179.254us      59.751us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.08%       3.870us         0.08%       3.870us       0.645us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        63.19%       3.192ms        63.19%       3.192ms       3.192ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         2.50%      69.210us        63.28%       1.753ms       1.753ms       0.000us         0.00%       1.482ms       1.482ms             1  
+                                       aten::layer_norm         0.34%       9.550us        60.78%       1.684ms     561.333us       0.000us         0.00%       1.482ms     494.135us             3  
+                                aten::native_layer_norm         1.89%      52.442us        60.43%       1.674ms     558.150us       1.150ms       100.00%       1.482ms     494.135us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.151ms       100.12%       1.151ms       1.151ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.150ms       100.00%       1.150ms     383.212us             3  
+                                Activity Buffer Request        51.68%       1.432ms        51.68%       1.432ms       1.432ms     332.769us        28.95%     332.769us     332.769us             1  
+                                            aten::empty         1.10%      30.460us         1.10%      30.460us       3.384us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         5.62%     155.772us         5.62%     155.772us      51.924us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.14%       3.891us         0.14%       3.891us       0.649us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        36.72%       1.018ms        36.72%       1.018ms       1.018ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.051ms
-Self CUDA time total: 3.326ms
+Self CPU time total: 2.771ms
+Self CUDA time total: 1.150ms
 
 
 
@@ -4683,19 +4671,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      94.845us       222.19%      94.845us      94.845us             1  
-                                       torch_layer_norm        12.19%      64.781us        99.08%     526.632us     526.632us       0.000us         0.00%      56.095us      56.095us             1  
-                                       aten::layer_norm         1.55%       8.240us        86.89%     461.851us     153.950us       0.000us         0.00%      56.095us      18.698us             3  
-                                aten::native_layer_norm         8.82%      46.862us        85.34%     453.611us     151.204us      42.687us       100.00%      56.095us      18.698us             3  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      42.687us       100.00%      42.687us      14.229us             3  
-                                Activity Buffer Request        37.58%     199.725us        37.58%     199.725us     199.725us      13.408us        31.41%      13.408us      13.408us             1  
-                                            aten::empty         5.07%      26.941us         5.07%      26.941us       2.993us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        33.00%     175.413us        33.00%     175.413us      58.471us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.88%       4.670us         0.88%       4.670us       0.778us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.92%       4.890us         0.92%       4.890us       4.890us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      86.813us       481.04%      86.813us      86.813us             1  
+                                       torch_layer_norm        13.94%      63.610us        98.78%     450.788us     450.788us       0.000us         0.00%      23.966us      23.966us             1  
+                                       aten::layer_norm         1.92%       8.751us        84.84%     387.178us     129.059us       0.000us         0.00%      23.966us       7.989us             3  
+                                aten::native_layer_norm        11.33%      51.701us        82.93%     378.427us     126.142us      18.047us       100.00%      23.966us       7.989us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      18.047us       100.00%      18.047us       6.016us             3  
+                                Activity Buffer Request        30.87%     140.892us        30.87%     140.892us     140.892us       5.919us        32.80%       5.919us       5.919us             1  
+                                            aten::empty         6.07%      27.691us         6.07%      27.691us       3.077us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        33.75%     154.013us        33.75%     154.013us      51.338us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.91%       4.130us         0.91%       4.130us       0.688us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.22%       5.560us         1.22%       5.560us       5.560us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 531.522us
-Self CUDA time total: 42.687us
+Self CPU time total: 456.348us
+Self CUDA time total: 18.047us
 
 
 
@@ -4705,19 +4693,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm        10.93%      66.051us        99.19%     599.583us     599.583us       0.000us         0.00%     137.212us     137.212us             1  
-                                       aten::layer_norm         1.47%       8.912us        88.27%     533.532us     177.844us       0.000us         0.00%     137.212us      45.737us             3  
-                                aten::native_layer_norm         8.28%      50.060us        86.79%     524.620us     174.873us      88.510us       100.00%     137.212us      45.737us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     106.109us       119.88%     106.109us     106.109us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      88.510us       100.00%      88.510us      29.503us             3  
-                                Activity Buffer Request        42.84%     258.935us        42.84%     258.935us     258.935us      48.702us        55.02%      48.702us      48.702us             1  
-                                            aten::empty         4.66%      28.180us         4.66%      28.180us       3.131us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        30.27%     182.954us        30.27%     182.954us      60.985us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.74%       4.491us         0.74%       4.491us       0.748us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.81%       4.880us         0.81%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      94.272us       347.01%      94.272us      94.272us             1  
+                                       torch_layer_norm         3.87%      67.581us        99.70%       1.743ms       1.743ms       0.000us         0.00%      36.063us      36.063us             1  
+                                       aten::layer_norm         0.54%       9.410us        95.84%       1.675ms     558.423us       0.000us         0.00%      36.063us      12.021us             3  
+                                aten::native_layer_norm         3.00%      52.431us        95.30%       1.666ms     555.286us      27.167us       100.00%      36.063us      12.021us             3  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      27.167us       100.00%      27.167us       9.056us             3  
+                                Activity Buffer Request        81.64%       1.427ms        81.64%       1.427ms       1.427ms       8.896us        32.75%       8.896us       8.896us             1  
+                                            aten::empty         1.64%      28.640us         1.64%      28.640us       3.182us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.79%     153.563us         8.79%     153.563us      51.188us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.23%       4.090us         0.23%       4.090us       0.682us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.30%       5.160us         0.30%       5.160us       5.160us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 604.463us
-Self CUDA time total: 88.510us
+Self CPU time total: 1.748ms
+Self CUDA time total: 27.167us
 
 
 
@@ -4727,19 +4715,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S128_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm        10.02%      64.262us        91.98%     590.053us     590.053us       0.000us         0.00%     278.967us     278.967us             1  
-                                       aten::layer_norm         1.23%       7.910us        81.96%     525.791us     175.264us       0.000us         0.00%     278.967us      92.989us             3  
-                                aten::native_layer_norm         7.34%      47.060us        80.73%     517.881us     172.627us     173.722us       100.00%     278.967us      92.989us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     175.066us       100.77%     175.066us     175.066us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     173.722us       100.00%     173.722us      57.907us             3  
-                                Activity Buffer Request        41.71%     267.606us        41.71%     267.606us     267.606us     105.245us        60.58%     105.245us     105.245us             1  
-                                            aten::empty         4.10%      26.330us         4.10%      26.330us       2.926us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        26.95%     172.865us        26.95%     172.865us      57.622us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.63%       4.020us         0.63%       4.020us       0.670us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         8.02%      51.462us         8.02%      51.462us      51.462us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm        15.30%      64.290us        98.85%     415.327us     415.327us       0.000us         0.00%     113.182us     113.182us             1  
+                                       aten::layer_norm         1.89%       7.931us        83.55%     351.037us     117.012us       0.000us         0.00%     113.182us      37.727us             3  
+                                aten::native_layer_norm        12.15%      51.059us        81.66%     343.106us     114.369us      72.639us       100.00%     113.182us      37.727us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      97.758us       134.58%      97.758us      97.758us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      72.639us       100.00%      72.639us      24.213us             3  
+                                Activity Buffer Request        25.15%     105.652us        25.15%     105.652us     105.652us      40.543us        55.81%      40.543us      40.543us             1  
+                                            aten::empty         7.08%      29.763us         7.08%      29.763us       3.307us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        36.37%     152.792us        36.37%     152.792us      50.931us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.91%       3.840us         0.91%       3.840us       0.640us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.15%       4.831us         1.15%       4.831us       4.831us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 641.515us
-Self CUDA time total: 173.722us
+Self CPU time total: 420.158us
+Self CUDA time total: 72.639us
 
 
 
@@ -4749,19 +4737,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S128_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         5.42%      64.123us        46.63%     551.262us     551.262us       0.000us         0.00%     999.011us     999.011us             1  
-                                       aten::layer_norm         0.70%       8.291us        41.21%     487.139us     162.380us       0.000us         0.00%     999.011us     333.004us             3  
-                                aten::native_layer_norm         4.04%      47.749us        40.50%     478.848us     159.616us     754.698us       100.00%     999.011us     333.004us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     755.978us       100.17%     755.978us     755.978us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     754.698us       100.00%     754.698us     251.566us             3  
-                                Activity Buffer Request        19.30%     228.145us        19.30%     228.145us     228.145us     244.313us        32.37%     244.313us     244.313us             1  
-                                            aten::empty         2.39%      28.290us         2.39%      28.290us       3.143us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        14.40%     170.293us        14.40%     170.293us      56.764us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.37%       4.371us         0.37%       4.371us       0.729us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        53.37%     630.953us        53.37%     630.953us     630.953us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.89%      68.361us        99.32%       1.748ms       1.748ms       0.000us         0.00%     226.432us     226.432us             1  
+                                       aten::layer_norm         0.51%       8.970us        95.44%       1.679ms     559.750us       0.000us         0.00%     226.432us      75.477us             3  
+                                aten::native_layer_norm         3.03%      53.343us        94.93%       1.670ms     556.760us     142.207us       100.00%     226.432us      75.477us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     143.552us       100.95%     143.552us     143.552us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     142.207us       100.00%     142.207us      47.402us             3  
+                                Activity Buffer Request        81.27%       1.430ms        81.27%       1.430ms       1.430ms      84.225us        59.23%      84.225us      84.225us             1  
+                                            aten::empty         1.69%      29.760us         1.69%      29.760us       3.307us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.71%     153.172us         8.71%     153.172us      51.057us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.23%       4.080us         0.23%       4.080us       0.680us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.68%      11.911us         0.68%      11.911us      11.911us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.182ms
-Self CUDA time total: 754.698us
+Self CPU time total: 1.760ms
+Self CUDA time total: 142.207us
 
 
 
@@ -4771,19 +4759,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm        10.01%      65.931us        89.70%     590.752us     590.752us       0.000us         0.00%     288.918us     288.918us             1  
-                                       aten::layer_norm         1.34%       8.832us        79.69%     524.821us     174.940us       0.000us         0.00%     288.918us      96.306us             3  
-                                aten::native_layer_norm         7.23%      47.600us        78.35%     515.989us     171.996us     192.505us       100.00%     288.918us      96.306us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     193.977us       100.76%     193.977us     193.977us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     192.505us       100.00%     192.505us      64.168us             3  
-                                Activity Buffer Request        41.13%     270.896us        41.13%     270.896us     270.896us      96.413us        50.08%      96.413us      96.413us             1  
-                                            aten::empty         4.24%      27.950us         4.24%      27.950us       3.106us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        25.15%     165.623us        25.15%     165.623us      55.208us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.60%       3.920us         0.60%       3.920us       0.653us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        10.30%      67.841us        10.30%      67.841us      67.841us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.86%      67.581us        99.71%       1.745ms       1.745ms       0.000us         0.00%     103.967us     103.967us             1  
+                                       aten::layer_norm         0.51%       8.910us        95.84%       1.677ms     559.073us       0.000us         0.00%     103.967us      34.656us             3  
+                                aten::native_layer_norm         3.07%      53.660us        95.33%       1.668ms     556.103us      69.343us       100.00%     103.967us      34.656us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     103.487us       149.24%     103.487us     103.487us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      69.343us       100.00%      69.343us      23.114us             3  
+                                Activity Buffer Request        81.52%       1.427ms        81.52%       1.427ms       1.427ms      34.624us        49.93%      34.624us      34.624us             1  
+                                            aten::empty         1.61%      28.261us         1.61%      28.261us       3.140us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         8.90%     155.753us         8.90%     155.753us      51.918us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.24%       4.120us         0.24%       4.120us       0.687us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.29%       5.151us         0.29%       5.151us       5.151us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 658.593us
-Self CUDA time total: 192.505us
+Self CPU time total: 1.750ms
+Self CUDA time total: 69.343us
 
 
 
@@ -4793,19 +4781,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         5.51%      65.042us        44.22%     521.671us     521.671us       0.000us         0.00%       1.021ms       1.021ms             1  
-                                       aten::layer_norm         0.70%       8.259us        38.70%     456.629us     152.210us       0.000us         0.00%       1.021ms     340.419us             3  
-                                aten::native_layer_norm         4.08%      48.143us        38.00%     448.370us     149.457us     782.094us       100.00%       1.021ms     340.419us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     783.471us       100.18%     783.471us     783.471us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     782.094us       100.00%     782.094us     260.698us             3  
-                                Activity Buffer Request        17.26%     203.644us        17.26%     203.644us     203.644us     239.163us        30.58%     239.163us     239.163us             1  
-                                            aten::empty         2.33%      27.440us         2.33%      27.440us       3.049us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        14.01%     165.323us        14.01%     165.323us      55.108us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.32%       3.820us         0.32%       3.820us       0.637us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        55.78%     658.143us        55.78%     658.143us     658.143us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm        11.35%      67.490us        99.15%     589.690us     589.690us       0.000us         0.00%     202.330us     202.330us             1  
+                                       aten::layer_norm         1.44%       8.590us        87.80%     522.200us     174.067us       0.000us         0.00%     202.330us      67.443us             3  
+                                aten::native_layer_norm         8.41%      50.041us        86.35%     513.610us     171.203us     128.124us       100.00%     202.330us      67.443us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     129.692us       101.22%     129.692us     129.692us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     128.124us       100.00%     128.124us      42.708us             3  
+                                Activity Buffer Request        46.63%     277.315us        46.63%     277.315us     277.315us      74.206us        57.92%      74.206us      74.206us             1  
+                                            aten::empty         4.68%      27.831us         4.68%      27.831us       3.092us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        25.89%     153.973us        25.89%     153.973us      51.324us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.75%       4.450us         0.75%       4.450us       0.742us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.85%       5.080us         0.85%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.180ms
-Self CUDA time total: 782.094us
+Self CPU time total: 594.770us
+Self CUDA time total: 128.124us
 
 
 
@@ -4815,19 +4803,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S512_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.28%      65.613us        26.55%     531.932us     531.932us       0.000us         0.00%       2.062ms       2.062ms             1  
-                                       aten::layer_norm         0.44%       8.751us        23.28%     466.319us     155.440us       0.000us         0.00%       2.062ms     687.358us             3  
-                                aten::native_layer_norm         2.38%      47.740us        22.84%     457.568us     152.523us       1.599ms       100.00%       2.062ms     687.358us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.600ms       100.08%       1.600ms       1.600ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.599ms       100.00%       1.599ms     532.854us             3  
-                                Activity Buffer Request        10.57%     211.745us        10.57%     211.745us     211.745us     463.511us        29.00%     463.511us     463.511us             1  
-                                            aten::empty         1.42%      28.490us         1.42%      28.490us       3.166us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         8.28%     165.833us         8.28%     165.833us      55.278us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.19%       3.760us         0.19%       3.760us       0.627us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        73.45%       1.471ms        73.45%       1.471ms       1.471ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         6.87%      68.511us        58.17%     579.770us     579.770us       0.000us         0.00%     720.407us     720.407us             1  
+                                       aten::layer_norm         0.88%       8.821us        51.29%     511.259us     170.420us       0.000us         0.00%     720.407us     240.136us             3  
+                                aten::native_layer_norm         5.17%      51.521us        50.41%     502.438us     167.479us     546.073us       100.00%     720.407us     240.136us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     547.577us       100.28%     547.577us     547.577us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     546.073us       100.00%     546.073us     182.024us             3  
+                                Activity Buffer Request        26.52%     264.294us        26.52%     264.294us     264.294us     174.334us        31.93%     174.334us     174.334us             1  
+                                            aten::empty         2.91%      29.030us         2.91%      29.030us       3.226us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        15.39%     153.384us        15.39%     153.384us      51.128us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.42%       4.209us         0.42%       4.209us       0.702us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        41.83%     416.987us        41.83%     416.987us     416.987us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.003ms
-Self CUDA time total: 1.599ms
+Self CPU time total: 996.757us
+Self CUDA time total: 546.073us
 
 
 
@@ -4837,19 +4825,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S512_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         1.73%      65.201us        15.56%     587.973us     587.973us       0.000us         0.00%       4.337ms       4.337ms             1  
-                                       aten::layer_norm         0.23%       8.512us        13.84%     522.772us     174.257us       0.000us         0.00%       4.337ms       1.446ms             3  
-                                aten::native_layer_norm         1.23%      46.631us        13.61%     514.260us     171.420us       3.314ms       100.00%       4.337ms       1.446ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       3.315ms       100.04%       3.315ms       3.315ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       3.314ms       100.00%       3.314ms       1.105ms             3  
-                                Activity Buffer Request         7.12%     269.056us         7.12%     269.056us     269.056us       1.023ms        30.87%       1.023ms       1.023ms             1  
-                                            aten::empty         0.74%      27.840us         0.74%      27.840us       3.093us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         4.41%     166.733us         4.41%     166.733us      55.578us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.11%       4.000us         0.11%       4.000us       0.667us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        84.44%       3.190ms        84.44%       3.190ms       3.190ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         4.10%      64.241us        34.57%     541.829us     541.829us       0.000us         0.00%       1.480ms       1.480ms             1  
+                                       aten::layer_norm         0.55%       8.560us        30.47%     477.588us     159.196us       0.000us         0.00%       1.480ms     493.436us             3  
+                                aten::native_layer_norm         3.24%      50.830us        29.93%     469.028us     156.343us       1.149ms       100.00%       1.480ms     493.436us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.151ms       100.12%       1.151ms       1.151ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.149ms       100.00%       1.149ms     383.133us             3  
+                                Activity Buffer Request        14.86%     232.814us        14.86%     232.814us     232.814us     330.909us        28.79%     330.909us     330.909us             1  
+                                            aten::empty         1.86%      29.081us         1.86%      29.081us       3.231us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         9.70%     152.022us         9.70%     152.022us      50.674us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.27%       4.281us         0.27%       4.281us       0.713us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        65.43%       1.025ms        65.43%       1.025ms       1.025ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.778ms
-Self CUDA time total: 3.314ms
+Self CPU time total: 1.567ms
+Self CUDA time total: 1.149ms
 
 
 
@@ -4859,19 +4847,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S1024_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         5.47%      63.581us        43.72%     508.311us     508.311us       0.000us         0.00%       1.020ms       1.020ms             1  
-                                       aten::layer_norm         0.74%       8.570us        38.25%     444.730us     148.243us       0.000us         0.00%       1.020ms     340.056us             3  
-                                aten::native_layer_norm         4.13%      48.012us        37.51%     436.160us     145.387us     778.734us       100.00%       1.020ms     340.056us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     780.173us       100.18%     780.173us     780.173us             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     778.734us       100.00%     778.734us     259.578us             3  
-                                Activity Buffer Request        16.59%     192.874us        16.59%     192.874us     192.874us     241.434us        31.00%     241.434us     241.434us             1  
-                                            aten::empty         2.34%      27.210us         2.34%      27.210us       3.023us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel        14.14%     164.374us        14.14%     164.374us      54.791us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.32%       3.690us         0.32%       3.690us       0.615us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        56.28%     654.424us        56.28%     654.424us     654.424us       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm        10.87%      65.290us        97.50%     585.660us     585.660us       0.000us         0.00%     211.160us     211.160us             1  
+                                       aten::layer_norm         1.49%       8.961us        86.63%     520.370us     173.457us       0.000us         0.00%     211.160us      70.387us             3  
+                                aten::native_layer_norm         8.59%      51.600us        85.14%     511.409us     170.470us     139.579us       100.00%     211.160us      70.387us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     140.987us       101.01%     140.987us     140.987us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     139.579us       100.00%     139.579us      46.526us             3  
+                                Activity Buffer Request        45.81%     275.144us        45.81%     275.144us     275.144us      71.581us        51.28%      71.581us      71.581us             1  
+                                            aten::empty         4.65%      27.942us         4.65%      27.942us       3.105us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        25.42%     152.693us        25.42%     152.693us      50.898us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.67%       4.030us         0.67%       4.030us       0.672us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         2.50%      14.990us         2.50%      14.990us      14.990us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.163ms
-Self CUDA time total: 778.734us
+Self CPU time total: 600.650us
+Self CUDA time total: 139.579us
 
 
 
@@ -4881,19 +4869,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S1024_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.23%      65.631us        26.92%     546.792us     546.792us       0.000us         0.00%       2.082ms       2.082ms             1  
-                                       aten::layer_norm         0.43%       8.791us        23.69%     481.161us     160.387us       0.000us         0.00%       2.082ms     694.073us             3  
-                                aten::native_layer_norm         2.34%      47.531us        23.26%     472.370us     157.457us       1.610ms       100.00%       2.082ms     694.073us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.611ms       100.08%       1.611ms       1.611ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.610ms       100.00%       1.610ms     536.701us             3  
-                                Activity Buffer Request        11.25%     228.475us        11.25%     228.475us     228.475us     472.116us        29.32%     472.116us     472.116us             1  
-                                            aten::empty         1.35%      27.360us         1.35%      27.360us       3.040us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         8.13%     165.123us         8.13%     165.123us      55.041us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.19%       3.881us         0.19%       3.881us       0.647us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        73.08%       1.484ms        73.08%       1.484ms       1.484ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         6.53%      63.420us        56.04%     544.209us     544.209us       0.000us         0.00%     725.021us     725.021us             1  
+                                       aten::layer_norm         0.90%       8.770us        49.51%     480.789us     160.263us       0.000us         0.00%     725.021us     241.674us             3  
+                                aten::native_layer_norm         5.25%      50.982us        48.61%     472.019us     157.340us     551.902us       100.00%     725.021us     241.674us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     553.342us       100.26%     553.342us     553.342us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     551.902us       100.00%     551.902us     183.967us             3  
+                                Activity Buffer Request        24.17%     234.744us        24.17%     234.744us     234.744us     173.119us        31.37%     173.119us     173.119us             1  
+                                            aten::empty         3.03%      29.450us         3.03%      29.450us       3.272us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        15.70%     152.482us        15.70%     152.482us      50.827us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.45%       4.361us         0.45%       4.361us       0.727us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        43.96%     426.887us        43.96%     426.887us     426.887us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.031ms
-Self CUDA time total: 1.610ms
+Self CPU time total: 971.096us
+Self CUDA time total: 551.902us
 
 
 
@@ -4903,19 +4891,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S1024_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         1.80%      65.951us        13.62%     497.891us     497.891us       0.000us         0.00%       4.269ms       4.269ms             1  
-                                       aten::layer_norm         0.24%       8.923us        11.82%     431.940us     143.980us       0.000us         0.00%       4.269ms       1.423ms             3  
-                                aten::native_layer_norm         1.26%      46.081us        11.57%     423.017us     141.006us       3.268ms       100.00%       4.269ms       1.423ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       3.270ms       100.04%       3.270ms       3.270ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       3.268ms       100.00%       3.268ms       1.089ms             3  
-                                Activity Buffer Request         4.93%     180.334us         4.93%     180.334us     180.334us       1.001ms        30.64%       1.001ms       1.001ms             1  
-                                            aten::empty         0.79%      28.830us         0.79%      28.830us       3.203us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         4.49%     164.033us         4.49%     164.033us      54.678us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.10%       3.739us         0.10%       3.739us       0.623us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        86.38%       3.157ms        86.38%       3.157ms       3.157ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         4.07%      66.881us        38.72%     635.751us     635.751us       0.000us         0.00%       1.469ms       1.469ms             1  
+                                       aten::layer_norm         0.55%       9.009us        34.64%     568.870us     189.623us       0.000us         0.00%       1.469ms     489.666us             3  
+                                aten::native_layer_norm         3.27%      53.630us        34.10%     559.861us     186.620us       1.138ms       100.00%       1.469ms     489.666us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.139ms       100.13%       1.139ms       1.139ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.138ms       100.00%       1.138ms     379.279us             3  
+                                Activity Buffer Request        19.12%     313.985us        19.12%     313.985us     313.985us     331.162us        29.10%     331.162us     331.162us             1  
+                                            aten::empty         1.88%      30.903us         1.88%      30.903us       3.434us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         9.57%     157.133us         9.57%     157.133us      52.378us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.26%       4.210us         0.26%       4.210us       0.702us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        61.28%       1.006ms        61.28%       1.006ms       1.006ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.655ms
-Self CUDA time total: 3.268ms
+Self CPU time total: 1.642ms
+Self CUDA time total: 1.138ms
 
 
 
@@ -4925,19 +4913,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S1024_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.96%      68.974us         8.04%     575.593us     575.593us       0.000us         0.00%       8.896ms       8.896ms             1  
-                                       aten::layer_norm         0.12%       8.430us         7.07%     506.619us     168.873us       0.000us         0.00%       8.896ms       2.965ms             3  
-                                aten::native_layer_norm         0.65%      46.671us         6.96%     498.189us     166.063us       6.715ms       100.00%       8.896ms       2.965ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       6.716ms       100.02%       6.716ms       6.716ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       6.715ms       100.00%       6.715ms       2.238ms             3  
-                                Activity Buffer Request         3.47%     248.836us         3.47%     248.836us     248.836us       2.181ms        32.49%       2.181ms       2.181ms             1  
-                                            aten::empty         0.38%      27.020us         0.38%      27.020us       3.002us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         2.40%     171.633us         2.40%     171.633us      57.211us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.06%       4.029us         0.06%       4.029us       0.672us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        91.96%       6.587ms        91.96%       6.587ms       6.587ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         2.42%      65.690us        15.85%     430.707us     430.707us       0.000us         0.00%       3.155ms       3.155ms             1  
+                                       aten::layer_norm         0.35%       9.490us        13.44%     365.017us     121.672us       0.000us         0.00%       3.155ms       1.052ms             3  
+                                aten::native_layer_norm         1.79%      48.727us        13.09%     355.527us     118.509us       2.409ms       100.00%       3.155ms       1.052ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.410ms       100.06%       2.410ms       2.410ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       2.409ms       100.00%       2.409ms     802.859us             3  
+                                Activity Buffer Request         4.38%     118.922us         4.38%     118.922us     118.922us     746.656us        31.00%     746.656us     746.656us             1  
+                                            aten::empty         1.13%      30.624us         1.13%      30.624us       3.403us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         5.65%     153.412us         5.65%     153.412us      51.137us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.14%       3.842us         0.14%       3.842us       0.640us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        84.15%       2.286ms        84.15%       2.286ms       2.286ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.163ms
-Self CUDA time total: 6.715ms
+Self CPU time total: 2.717ms
+Self CUDA time total: 2.409ms
 
 
 
@@ -4947,19 +4935,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.21%      65.101us        27.13%     550.572us     550.572us       0.000us         0.00%       2.070ms       2.070ms             1  
-                                       aten::layer_norm         0.42%       8.470us        23.93%     485.471us     161.824us       0.000us         0.00%       2.070ms     689.859us             3  
-                                aten::native_layer_norm         2.33%      47.334us        23.51%     477.001us     159.000us       1.603ms       100.00%       2.070ms     689.859us             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.604ms       100.08%       1.604ms       1.604ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.603ms       100.00%       1.603ms     534.289us             3  
-                                Activity Buffer Request        11.26%     228.575us        11.26%     228.575us     228.575us     466.708us        29.12%     466.708us     466.708us             1  
-                                            aten::empty         1.44%      29.320us         1.44%      29.320us       3.258us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         8.29%     168.154us         8.29%     168.154us      56.051us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.18%       3.618us         0.18%       3.618us       0.603us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        72.87%       1.479ms        72.87%       1.479ms       1.479ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         6.72%      66.011us        55.62%     546.350us     546.350us       0.000us         0.00%     735.937us     735.937us             1  
+                                       aten::layer_norm         0.92%       8.990us        48.90%     480.339us     160.113us       0.000us         0.00%     735.937us     245.312us             3  
+                                aten::native_layer_norm         5.16%      50.724us        47.98%     471.349us     157.116us     560.097us       100.00%     735.937us     245.312us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us     561.633us       100.27%     561.633us     561.633us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us     560.097us       100.00%     560.097us     186.699us             3  
+                                Activity Buffer Request        23.82%     234.014us        23.82%     234.014us     234.014us     175.840us        31.39%     175.840us     175.840us             1  
+                                            aten::empty         2.88%      28.270us         2.88%      28.270us       3.141us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        15.72%     154.402us        15.72%     154.402us      51.467us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.40%       3.939us         0.40%       3.939us       0.656us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        44.38%     435.997us        44.38%     435.997us     435.997us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.029ms
-Self CUDA time total: 1.603ms
+Self CPU time total: 982.347us
+Self CUDA time total: 560.097us
 
 
 
@@ -4969,19 +4957,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       aten::layer_norm         0.24%       8.881us        12.85%     473.140us     157.713us       0.000us         0.00%       4.264ms       1.421ms             3  
-                                aten::native_layer_norm         1.29%      47.472us        12.61%     464.259us     154.753us       3.266ms       100.00%       4.264ms       1.421ms             3  
-                                       torch_layer_norm         1.85%      67.922us        14.70%     541.062us     541.062us       0.000us         0.00%       4.264ms       4.264ms             1  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       3.268ms       100.05%       3.268ms       3.268ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       3.266ms       100.00%       3.266ms       1.089ms             3  
-                                Activity Buffer Request         5.85%     215.475us         5.85%     215.475us     215.475us     997.400us        30.54%     997.400us     997.400us             1  
-                                            aten::empty         0.76%      27.950us         0.76%      27.950us       3.106us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         4.61%     169.513us         4.61%     169.513us      56.504us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.10%       3.849us         0.10%       3.849us       0.642us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        85.30%       3.140ms        85.30%       3.140ms       3.140ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         4.56%      64.832us        29.06%     412.897us     412.897us       0.000us         0.00%       1.469ms       1.469ms             1  
+                                       aten::layer_norm         0.65%       9.228us        24.50%     348.065us     116.022us       0.000us         0.00%       1.469ms     489.663us             3  
+                                aten::native_layer_norm         3.69%      52.410us        23.85%     338.837us     112.946us       1.133ms       100.00%       1.469ms     489.663us             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       1.135ms       100.12%       1.135ms       1.135ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       1.133ms       100.00%       1.133ms     377.716us             3  
+                                Activity Buffer Request         7.07%     100.442us         7.07%     100.442us     100.442us     335.839us        29.64%     335.839us     335.839us             1  
+                                            aten::empty         2.06%      29.311us         2.06%      29.311us       3.257us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel        10.76%     152.823us        10.76%     152.823us      50.941us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.27%       3.851us         0.27%       3.851us       0.642us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        70.94%       1.008ms        70.94%       1.008ms       1.008ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.681ms
-Self CUDA time total: 3.266ms
+Self CPU time total: 1.421ms
+Self CUDA time total: 1.133ms
 
 
 
@@ -4991,19 +4979,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.99%      71.201us         8.44%     605.933us     605.933us       0.000us         0.00%       8.838ms       8.838ms             1  
-                                       aten::layer_norm         0.13%       9.280us         7.45%     534.732us     178.244us       0.000us         0.00%       8.838ms       2.946ms             3  
-                                aten::native_layer_norm         0.69%      49.421us         7.32%     525.452us     175.151us       6.702ms       100.00%       8.838ms       2.946ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       6.703ms       100.02%       6.703ms       6.703ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       6.702ms       100.00%       6.702ms       2.234ms             3  
-                                Activity Buffer Request         3.78%     271.526us         3.78%     271.526us     271.526us       2.136ms        31.88%       2.136ms       2.136ms             1  
-                                            aten::empty         0.38%      27.460us         0.38%      27.460us       3.051us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         2.41%     173.045us         2.41%     173.045us      57.682us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.06%       4.000us         0.06%       4.000us       0.667us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        91.56%       6.572ms        91.56%       6.572ms       6.572ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         2.43%      67.770us        21.38%     597.070us     597.070us       0.000us         0.00%       3.032ms       3.032ms             1  
+                                       aten::layer_norm         0.34%       9.401us        18.95%     529.300us     176.433us       0.000us         0.00%       3.032ms       1.011ms             3  
+                                aten::native_layer_norm         1.84%      51.400us        18.61%     519.899us     173.300us       2.325ms       100.00%       3.032ms       1.011ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.327ms       100.06%       2.327ms       2.327ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       2.325ms       100.00%       2.325ms     775.112us             3  
+                                Activity Buffer Request         9.90%     276.585us         9.90%     276.585us     276.585us     706.558us        30.39%     706.558us     706.558us             1  
+                                            aten::empty         1.09%      30.392us         1.09%      30.392us       3.377us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         5.64%     157.652us         5.64%     157.652us      52.551us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.14%       3.870us         0.14%       3.870us       0.645us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        78.62%       2.196ms        78.62%       2.196ms       2.196ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.178ms
-Self CUDA time total: 6.702ms
+Self CPU time total: 2.793ms
+Self CUDA time total: 2.325ms
 
 
 
@@ -5013,121 +5001,75 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.51%      71.382us         4.14%     576.813us     576.813us       0.000us         0.00%      17.998ms      17.998ms             1  
-                                       aten::layer_norm         0.06%       9.001us         3.62%     505.431us     168.477us       0.000us         0.00%      17.998ms       5.999ms             3  
-                                aten::native_layer_norm         0.35%      49.501us         3.56%     496.430us     165.477us      13.500ms       100.00%      17.998ms       5.999ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us      13.502ms       100.01%      13.502ms      13.502ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us      13.500ms       100.00%      13.500ms       4.500ms             3  
-                                Activity Buffer Request         1.51%     210.264us         1.51%     210.264us     210.264us       4.498ms        33.31%       4.498ms       4.498ms             1  
-                                            aten::empty         0.21%      29.200us         0.21%      29.200us       3.244us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         1.46%     203.594us         1.46%     203.594us      67.865us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.03%       3.871us         0.03%       3.871us       0.645us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        95.86%      13.371ms        95.86%      13.371ms      13.371ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         1.28%      68.262us        10.71%     572.390us     572.390us       0.000us         0.00%       6.493ms       6.493ms             1  
+                                       aten::layer_norm         0.16%       8.770us         9.43%     504.128us     168.043us       0.000us         0.00%       6.493ms       2.164ms             3  
+                                aten::native_layer_norm         0.96%      51.508us         9.27%     495.358us     165.119us       4.900ms       100.00%       6.493ms       2.164ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.901ms       100.03%       4.901ms       4.901ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.900ms       100.00%       4.900ms       1.633ms             3  
+                                Activity Buffer Request         4.74%     253.634us         4.74%     253.634us     253.634us       1.594ms        32.53%       1.594ms       1.594ms             1  
+                                            aten::empty         0.56%      29.682us         0.56%      29.682us       3.298us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         2.93%     156.523us         2.93%     156.523us      52.174us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.08%       4.011us         0.08%       4.011us       0.669us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        89.29%       4.774ms        89.29%       4.774ms       4.774ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 13.947ms
-Self CUDA time total: 13.500ms
+Self CPU time total: 5.346ms
+Self CUDA time total: 4.900ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_layer_norm         LN_B16_S1024_D1024     0.29  False
-torch_layer_norm         LN_B16_S1024_D2048     0.59  False
-torch_layer_norm         LN_B16_S1024_D4096     1.15  False
-torch_layer_norm         LN_B16_S1024_D8192     2.27  False
+torch_layer_norm         LN_B16_S1024_D1024     0.05  False
+torch_layer_norm         LN_B16_S1024_D2048     0.21  False
+torch_layer_norm         LN_B16_S1024_D4096     0.42  False
+torch_layer_norm         LN_B16_S1024_D8192     0.85  False
 torch_layer_norm         LN_B16_S128_D1024      0.03  False
-torch_layer_norm         LN_B16_S128_D2048      0.04  False
-torch_layer_norm         LN_B16_S128_D4096      0.05  False
-torch_layer_norm         LN_B16_S128_D8192      0.27  False
-torch_layer_norm         LN_B16_S2048_D1024     0.59  False
-torch_layer_norm         LN_B16_S2048_D2048     1.16  False
-torch_layer_norm         LN_B16_S2048_D4096     2.30  False
-torch_layer_norm         LN_B16_S2048_D8192     4.51  False
-torch_layer_norm         LN_B16_S512_D1024      0.07  False
-torch_layer_norm         LN_B16_S512_D2048      0.29  False
-torch_layer_norm         LN_B16_S512_D4096      0.59  False
-torch_layer_norm         LN_B16_S512_D8192      1.15  False
+torch_layer_norm         LN_B16_S128_D2048      0.03  False
+torch_layer_norm         LN_B16_S128_D4096      0.04  False
+torch_layer_norm         LN_B16_S128_D8192      0.05  False
+torch_layer_norm         LN_B16_S2048_D1024     0.21  False
+torch_layer_norm         LN_B16_S2048_D2048     0.42  False
+torch_layer_norm         LN_B16_S2048_D4096     0.82  False
+torch_layer_norm         LN_B16_S2048_D8192     1.68  False
+torch_layer_norm         LN_B16_S512_D1024      0.04  False
+torch_layer_norm         LN_B16_S512_D2048      0.05  False
+torch_layer_norm         LN_B16_S512_D4096      0.21  False
+torch_layer_norm         LN_B16_S512_D8192      0.43  False
 torch_layer_norm         LN_B1_S1024_D1024      0.03  False
 torch_layer_norm         LN_B1_S1024_D2048      0.03  False
-torch_layer_norm         LN_B1_S1024_D4096      0.04  False
-torch_layer_norm         LN_B1_S1024_D8192      0.05  False
-torch_layer_norm         LN_B1_S128_D1024       0.03  False
+torch_layer_norm         LN_B1_S1024_D4096      0.03  False
+torch_layer_norm         LN_B1_S1024_D8192      0.04  False
+torch_layer_norm         LN_B1_S128_D1024       0.02  False
 torch_layer_norm         LN_B1_S128_D2048       0.03  False
 torch_layer_norm         LN_B1_S128_D4096       0.03  False
 torch_layer_norm         LN_B1_S128_D8192       0.03  False
-torch_layer_norm         LN_B1_S2048_D1024      0.04  False
-torch_layer_norm         LN_B1_S2048_D2048      0.04  False
-torch_layer_norm         LN_B1_S2048_D4096      0.05  False
-torch_layer_norm         LN_B1_S2048_D8192      0.27  False
+torch_layer_norm         LN_B1_S2048_D1024      0.03  False
+torch_layer_norm         LN_B1_S2048_D2048      0.03  False
+torch_layer_norm         LN_B1_S2048_D4096      0.04  False
+torch_layer_norm         LN_B1_S2048_D8192      0.05  False
 torch_layer_norm         LN_B1_S512_D1024       0.03  False
 torch_layer_norm         LN_B1_S512_D2048       0.03  False
 torch_layer_norm         LN_B1_S512_D4096       0.03  False
-torch_layer_norm         LN_B1_S512_D8192       0.04  False
-torch_layer_norm         LN_B4_S1024_D1024      0.05  False
-torch_layer_norm         LN_B4_S1024_D2048      0.06  False
-torch_layer_norm         LN_B4_S1024_D4096      0.28  False
-torch_layer_norm         LN_B4_S1024_D8192      0.59  False
+torch_layer_norm         LN_B1_S512_D8192       0.03  False
+torch_layer_norm         LN_B4_S1024_D1024      0.03  False
+torch_layer_norm         LN_B4_S1024_D2048      0.04  False
+torch_layer_norm         LN_B4_S1024_D4096      0.05  False
+torch_layer_norm         LN_B4_S1024_D8192      0.20  False
 torch_layer_norm         LN_B4_S128_D1024       0.03  False
 torch_layer_norm         LN_B4_S128_D2048       0.03  False
 torch_layer_norm         LN_B4_S128_D4096       0.03  False
-torch_layer_norm         LN_B4_S128_D8192       0.04  False
-torch_layer_norm         LN_B4_S2048_D1024      0.07  False
-torch_layer_norm         LN_B4_S2048_D2048      0.28  False
-torch_layer_norm         LN_B4_S2048_D4096      0.58  False
-torch_layer_norm         LN_B4_S2048_D8192      1.15  False
+torch_layer_norm         LN_B4_S128_D8192       0.03  False
+torch_layer_norm         LN_B4_S2048_D1024      0.04  False
+torch_layer_norm         LN_B4_S2048_D2048      0.05  False
+torch_layer_norm         LN_B4_S2048_D4096      0.21  False
+torch_layer_norm         LN_B4_S2048_D8192      0.44  False
 torch_layer_norm         LN_B4_S512_D1024       0.03  False
-torch_layer_norm         LN_B4_S512_D2048       0.04  False
-torch_layer_norm         LN_B4_S512_D4096       0.05  False
-torch_layer_norm         LN_B4_S512_D8192       0.27  False
+torch_layer_norm         LN_B4_S512_D2048       0.03  False
+torch_layer_norm         LN_B4_S512_D4096       0.04  False
+torch_layer_norm         LN_B4_S512_D8192       0.05  False
 
▶ UV Install Logs
diff --git a/layer_norm/results/artifacts/combine/latency.svg b/layer_norm/results/artifacts/combine/latency.svg index 2e30821930d26924b75424f8758b9026fe74c8c4..effb56da9741bfd7e06b460f98d00a10f6c0dd0b 100644 --- a/layer_norm/results/artifacts/combine/latency.svg +++ b/layer_norm/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9666e51a7b23e41e320cf61de04ef7044c3870632454dcae02bf6d9c87decec7 +oid sha256:e7883bd5f88a9163cc9fdaeec2076ca6319f97d413c6bea136db33612dc2b864 size 947 diff --git a/layer_norm/results/combined_results.html b/layer_norm/results/combined_results.html index ea4e6fadbc68b996d1f222c9696292e116acb189..8b7ee1ce45eb4a38c542c0685544f74a7f2b87bd 100644 --- a/layer_norm/results/combined_results.html +++ b/layer_norm/results/combined_results.html @@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - 2025-10-24T19:26:16.447564 + 2025-10-27T14:46:34.455868 image/svg+xml @@ -3900,7 +3900,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: combine | 38.84s +Cell: combine | 4.28s | Raw @@ -3972,13 +3972,13 @@ Cell: combine | 38.84s
======================================================================
 LOADING BENCHMARK DATA
 ======================================================================
-✓ PyTorch LayerNorm             : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/layer_norm/impls/.uvnote/cache/679c54caaf848e698d978e76e5f2839b8565918d30fac991242aebea8229f1c9
-✓ HF Kernels LayerNorm          : /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/layer_norm/impls/.uvnote/cache/f7de4b4d3171468ce97015124a3af1a23ef8d4ff4f319bd566a88676d47f08db
+✓ PyTorch LayerNorm             : /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/4403c31e9bef6e648597b4fcc9cfdc402678aaa4f90636b74325f12d334214a3
+✓ HF Kernels LayerNorm          : /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/bd278151199f29b397d85857b87922edaa39a62623fb28e0465de47d6a3bac74
 
   ✓ Found PyTorch LayerNorm
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/layer_norm/impls/.uvnote/cache/679c54caaf848e698d978e76e5f2839b8565918d30fac991242aebea8229f1c9/layer_norm.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/4403c31e9bef6e648597b4fcc9cfdc402678aaa4f90636b74325f12d334214a3/layer_norm.jsonl
   ✓ Found HF Kernels LayerNorm
-     Path: /home/ubuntu/Projects/kernels-benchmarks-consolidated/benches/layer_norm/impls/.uvnote/cache/f7de4b4d3171468ce97015124a3af1a23ef8d4ff4f319bd566a88676d47f08db/layer_norm.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/bd278151199f29b397d85857b87922edaa39a62623fb28e0465de47d6a3bac74/layer_norm.jsonl
 
 ======================================================================
 Summary: 2 found, 0 skipped, 0 missing
@@ -3987,102 +3987,102 @@ Summary: 2 found, 0 skipped, 0 missing
 COMBINED BENCHMARK SUMMARY
 
 impl                     wl                  p50(ms)  ok
-hf_kernels_layer_norm    LN_B16_S1024_D1024     0.29  False
-hf_kernels_layer_norm    LN_B16_S1024_D2048     0.61  False
-hf_kernels_layer_norm    LN_B16_S1024_D4096     1.15  False
-hf_kernels_layer_norm    LN_B16_S1024_D8192     2.27  False
+hf_kernels_layer_norm    LN_B16_S1024_D1024     0.05  False
+hf_kernels_layer_norm    LN_B16_S1024_D2048     0.22  False
+hf_kernels_layer_norm    LN_B16_S1024_D4096     0.44  False
+hf_kernels_layer_norm    LN_B16_S1024_D8192     0.84  False
 hf_kernels_layer_norm    LN_B16_S128_D1024      0.05  False
 hf_kernels_layer_norm    LN_B16_S128_D2048      0.05  False
-hf_kernels_layer_norm    LN_B16_S128_D4096      0.06  False
-hf_kernels_layer_norm    LN_B16_S128_D8192      0.30  False
-hf_kernels_layer_norm    LN_B16_S2048_D1024     0.61  False
-hf_kernels_layer_norm    LN_B16_S2048_D2048     1.20  False
-hf_kernels_layer_norm    LN_B16_S2048_D4096     2.27  False
-hf_kernels_layer_norm    LN_B16_S2048_D8192     4.51  False
-hf_kernels_layer_norm    LN_B16_S512_D1024      0.06  False
-hf_kernels_layer_norm    LN_B16_S512_D2048      0.30  False
-hf_kernels_layer_norm    LN_B16_S512_D4096      0.59  False
-hf_kernels_layer_norm    LN_B16_S512_D8192      1.16  False
+hf_kernels_layer_norm    LN_B16_S128_D4096      0.05  False
+hf_kernels_layer_norm    LN_B16_S128_D8192      0.05  False
+hf_kernels_layer_norm    LN_B16_S2048_D1024     0.21  False
+hf_kernels_layer_norm    LN_B16_S2048_D2048     0.46  False
+hf_kernels_layer_norm    LN_B16_S2048_D4096     0.84  False
+hf_kernels_layer_norm    LN_B16_S2048_D8192     1.65  False
+hf_kernels_layer_norm    LN_B16_S512_D1024      0.05  False
+hf_kernels_layer_norm    LN_B16_S512_D2048      0.05  False
+hf_kernels_layer_norm    LN_B16_S512_D4096      0.21  False
+hf_kernels_layer_norm    LN_B16_S512_D8192      0.43  False
 hf_kernels_layer_norm    LN_B1_S1024_D1024      0.05  False
 hf_kernels_layer_norm    LN_B1_S1024_D2048      0.05  False
 hf_kernels_layer_norm    LN_B1_S1024_D4096      0.05  False
-hf_kernels_layer_norm    LN_B1_S1024_D8192      0.06  False
-hf_kernels_layer_norm    LN_B1_S128_D1024       0.05  False
+hf_kernels_layer_norm    LN_B1_S1024_D8192      0.05  False
+hf_kernels_layer_norm    LN_B1_S128_D1024       0.04  False
 hf_kernels_layer_norm    LN_B1_S128_D2048       0.05  False
 hf_kernels_layer_norm    LN_B1_S128_D4096       0.05  False
 hf_kernels_layer_norm    LN_B1_S128_D8192       0.05  False
 hf_kernels_layer_norm    LN_B1_S2048_D1024      0.05  False
 hf_kernels_layer_norm    LN_B1_S2048_D2048      0.05  False
-hf_kernels_layer_norm    LN_B1_S2048_D4096      0.06  False
-hf_kernels_layer_norm    LN_B1_S2048_D8192      0.29  False
+hf_kernels_layer_norm    LN_B1_S2048_D4096      0.05  False
+hf_kernels_layer_norm    LN_B1_S2048_D8192      0.05  False
 hf_kernels_layer_norm    LN_B1_S512_D1024       0.05  False
 hf_kernels_layer_norm    LN_B1_S512_D2048       0.05  False
 hf_kernels_layer_norm    LN_B1_S512_D4096       0.05  False
 hf_kernels_layer_norm    LN_B1_S512_D8192       0.05  False
 hf_kernels_layer_norm    LN_B4_S1024_D1024      0.05  False
-hf_kernels_layer_norm    LN_B4_S1024_D2048      0.07  False
-hf_kernels_layer_norm    LN_B4_S1024_D4096      0.29  False
-hf_kernels_layer_norm    LN_B4_S1024_D8192      0.59  False
+hf_kernels_layer_norm    LN_B4_S1024_D2048      0.05  False
+hf_kernels_layer_norm    LN_B4_S1024_D4096      0.05  False
+hf_kernels_layer_norm    LN_B4_S1024_D8192      0.21  False
 hf_kernels_layer_norm    LN_B4_S128_D1024       0.05  False
 hf_kernels_layer_norm    LN_B4_S128_D2048       0.05  False
 hf_kernels_layer_norm    LN_B4_S128_D4096       0.05  False
 hf_kernels_layer_norm    LN_B4_S128_D8192       0.05  False
-hf_kernels_layer_norm    LN_B4_S2048_D1024      0.06  False
-hf_kernels_layer_norm    LN_B4_S2048_D2048      0.30  False
-hf_kernels_layer_norm    LN_B4_S2048_D4096      0.60  False
-hf_kernels_layer_norm    LN_B4_S2048_D8192      1.15  False
+hf_kernels_layer_norm    LN_B4_S2048_D1024      0.05  False
+hf_kernels_layer_norm    LN_B4_S2048_D2048      0.06  False
+hf_kernels_layer_norm    LN_B4_S2048_D4096      0.21  False
+hf_kernels_layer_norm    LN_B4_S2048_D8192      0.44  False
 hf_kernels_layer_norm    LN_B4_S512_D1024       0.05  False
 hf_kernels_layer_norm    LN_B4_S512_D2048       0.05  False
-hf_kernels_layer_norm    LN_B4_S512_D4096       0.06  False
-hf_kernels_layer_norm    LN_B4_S512_D8192       0.29  False
-torch_layer_norm         LN_B16_S1024_D1024     0.29  False
-torch_layer_norm         LN_B16_S1024_D2048     0.59  False
-torch_layer_norm         LN_B16_S1024_D4096     1.15  False
-torch_layer_norm         LN_B16_S1024_D8192     2.27  False
+hf_kernels_layer_norm    LN_B4_S512_D4096       0.05  False
+hf_kernels_layer_norm    LN_B4_S512_D8192       0.05  False
+torch_layer_norm         LN_B16_S1024_D1024     0.05  False
+torch_layer_norm         LN_B16_S1024_D2048     0.21  False
+torch_layer_norm         LN_B16_S1024_D4096     0.42  False
+torch_layer_norm         LN_B16_S1024_D8192     0.85  False
 torch_layer_norm         LN_B16_S128_D1024      0.03  False
-torch_layer_norm         LN_B16_S128_D2048      0.04  False
-torch_layer_norm         LN_B16_S128_D4096      0.05  False
-torch_layer_norm         LN_B16_S128_D8192      0.27  False
-torch_layer_norm         LN_B16_S2048_D1024     0.59  False
-torch_layer_norm         LN_B16_S2048_D2048     1.16  False
-torch_layer_norm         LN_B16_S2048_D4096     2.30  False
-torch_layer_norm         LN_B16_S2048_D8192     4.51  False
-torch_layer_norm         LN_B16_S512_D1024      0.07  False
-torch_layer_norm         LN_B16_S512_D2048      0.29  False
-torch_layer_norm         LN_B16_S512_D4096      0.59  False
-torch_layer_norm         LN_B16_S512_D8192      1.15  False
+torch_layer_norm         LN_B16_S128_D2048      0.03  False
+torch_layer_norm         LN_B16_S128_D4096      0.04  False
+torch_layer_norm         LN_B16_S128_D8192      0.05  False
+torch_layer_norm         LN_B16_S2048_D1024     0.21  False
+torch_layer_norm         LN_B16_S2048_D2048     0.42  False
+torch_layer_norm         LN_B16_S2048_D4096     0.82  False
+torch_layer_norm         LN_B16_S2048_D8192     1.68  False
+torch_layer_norm         LN_B16_S512_D1024      0.04  False
+torch_layer_norm         LN_B16_S512_D2048      0.05  False
+torch_layer_norm         LN_B16_S512_D4096      0.21  False
+torch_layer_norm         LN_B16_S512_D8192      0.43  False
 torch_layer_norm         LN_B1_S1024_D1024      0.03  False
 torch_layer_norm         LN_B1_S1024_D2048      0.03  False
-torch_layer_norm         LN_B1_S1024_D4096      0.04  False
-torch_layer_norm         LN_B1_S1024_D8192      0.05  False
-torch_layer_norm         LN_B1_S128_D1024       0.03  False
+torch_layer_norm         LN_B1_S1024_D4096      0.03  False
+torch_layer_norm         LN_B1_S1024_D8192      0.04  False
+torch_layer_norm         LN_B1_S128_D1024       0.02  False
 torch_layer_norm         LN_B1_S128_D2048       0.03  False
 torch_layer_norm         LN_B1_S128_D4096       0.03  False
 torch_layer_norm         LN_B1_S128_D8192       0.03  False
-torch_layer_norm         LN_B1_S2048_D1024      0.04  False
-torch_layer_norm         LN_B1_S2048_D2048      0.04  False
-torch_layer_norm         LN_B1_S2048_D4096      0.05  False
-torch_layer_norm         LN_B1_S2048_D8192      0.27  False
+torch_layer_norm         LN_B1_S2048_D1024      0.03  False
+torch_layer_norm         LN_B1_S2048_D2048      0.03  False
+torch_layer_norm         LN_B1_S2048_D4096      0.04  False
+torch_layer_norm         LN_B1_S2048_D8192      0.05  False
 torch_layer_norm         LN_B1_S512_D1024       0.03  False
 torch_layer_norm         LN_B1_S512_D2048       0.03  False
 torch_layer_norm         LN_B1_S512_D4096       0.03  False
-torch_layer_norm         LN_B1_S512_D8192       0.04  False
-torch_layer_norm         LN_B4_S1024_D1024      0.05  False
-torch_layer_norm         LN_B4_S1024_D2048      0.06  False
-torch_layer_norm         LN_B4_S1024_D4096      0.28  False
-torch_layer_norm         LN_B4_S1024_D8192      0.59  False
+torch_layer_norm         LN_B1_S512_D8192       0.03  False
+torch_layer_norm         LN_B4_S1024_D1024      0.03  False
+torch_layer_norm         LN_B4_S1024_D2048      0.04  False
+torch_layer_norm         LN_B4_S1024_D4096      0.05  False
+torch_layer_norm         LN_B4_S1024_D8192      0.20  False
 torch_layer_norm         LN_B4_S128_D1024       0.03  False
 torch_layer_norm         LN_B4_S128_D2048       0.03  False
 torch_layer_norm         LN_B4_S128_D4096       0.03  False
-torch_layer_norm         LN_B4_S128_D8192       0.04  False
-torch_layer_norm         LN_B4_S2048_D1024      0.07  False
-torch_layer_norm         LN_B4_S2048_D2048      0.28  False
-torch_layer_norm         LN_B4_S2048_D4096      0.58  False
-torch_layer_norm         LN_B4_S2048_D8192      1.15  False
+torch_layer_norm         LN_B4_S128_D8192       0.03  False
+torch_layer_norm         LN_B4_S2048_D1024      0.04  False
+torch_layer_norm         LN_B4_S2048_D2048      0.05  False
+torch_layer_norm         LN_B4_S2048_D4096      0.21  False
+torch_layer_norm         LN_B4_S2048_D8192      0.44  False
 torch_layer_norm         LN_B4_S512_D1024       0.03  False
-torch_layer_norm         LN_B4_S512_D2048       0.04  False
-torch_layer_norm         LN_B4_S512_D4096       0.05  False
-torch_layer_norm         LN_B4_S512_D8192       0.27  False
+torch_layer_norm         LN_B4_S512_D2048       0.03  False
+torch_layer_norm         LN_B4_S512_D4096       0.04  False
+torch_layer_norm         LN_B4_S512_D8192       0.05  False
 
 GENERATING COMBINED VISUALIZATION
 
@@ -4101,53 +4101,7 @@ Implementations included:
 
▶ UV Install Logs
@@ -4160,7 +4114,7 @@ Installed 37 packages in 205ms - 2025-10-24T19:26:16.447564 + 2025-10-27T14:46:34.455868 image/svg+xml